diff --git a/.automation_scripts/parse_xml_results.py b/.automation_scripts/parse_xml_results.py new file mode 100644 index 000000000000..7db2e1ce9233 --- /dev/null +++ b/.automation_scripts/parse_xml_results.py @@ -0,0 +1,178 @@ +""" The Python PyTorch testing script. +## +# Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. +""" + +import xml.etree.ElementTree as ET +from pathlib import Path +from typing import Any, Dict, Tuple + +# Backends list +BACKENDS_LIST = [ + "dist-gloo", + "dist-nccl" +] + +TARGET_WORKFLOW = "--rerun-disabled-tests" + +def get_job_id(report: Path) -> int: + # [Job id in artifacts] + # Retrieve the job id from the report path. In our GHA workflows, we append + # the job id to the end of the report name, so `report` looks like: + # unzipped-test-reports-foo_5596745227/test/test-reports/foo/TEST-foo.xml + # and we want to get `5596745227` out of it. + try: + return int(report.parts[0].rpartition("_")[2]) + except ValueError: + return -1 + +def is_rerun_disabled_tests(root: ET.ElementTree) -> bool: + """ + Check if the test report is coming from rerun_disabled_tests workflow + """ + skipped = root.find(".//*skipped") + # Need to check against None here, if not skipped doesn't work as expected + if skipped is None: + return False + + message = skipped.attrib.get("message", "") + return TARGET_WORKFLOW in message or "num_red" in message + +def parse_xml_report( + tag: str, + report: Path, + workflow_id: int, + workflow_run_attempt: int, + work_flow_name: str +) -> Dict[Tuple[str], Dict[str, Any]]: + """Convert a test report xml file into a JSON-serializable list of test cases.""" + print(f"Parsing {tag}s for test report: {report}") + + job_id = get_job_id(report) + print(f"Found job id: {job_id}") + + test_cases: Dict[Tuple[str], Dict[str, Any]] = {} + + root = ET.parse(report) + # TODO: unlike unittest, pytest-flakefinder used by rerun disabled tests for test_ops + # includes skipped messages multiple times (50 times by default). This slows down + # this script too much (O(n)) because it tries to gather all the stats. This should + # be fixed later in the way we use pytest-flakefinder. 
A zipped test report from rerun + # disabled test is only few MB, but will balloon up to a much bigger XML file after + # extracting from a dozen to few hundred MB + if is_rerun_disabled_tests(root): + return test_cases + + for test_case in root.iter(tag): + case = process_xml_element(test_case) + if tag == 'testcase': + case["workflow_id"] = workflow_id + case["workflow_run_attempt"] = workflow_run_attempt + case["job_id"] = job_id + case["work_flow_name"] = work_flow_name + + # [invoking file] + # The name of the file that the test is located in is not necessarily + # the same as the name of the file that invoked the test. + # For example, `test_jit.py` calls into multiple other test files (e.g. + # jit/test_dce.py). For sharding/test selection purposes, we want to + # record the file that invoked the test. + # + # To do this, we leverage an implementation detail of how we write out + # tests (https://bit.ly/3ajEV1M), which is that reports are created + # under a folder with the same name as the invoking file. + case_name = report.parent.name + for ind in range(len(BACKENDS_LIST)): + if BACKENDS_LIST[ind] in report.parts: + case_name = case_name + "_" + BACKENDS_LIST[ind] + break + case["invoking_file"] = case_name + test_cases[ ( case["invoking_file"], case["classname"], case["name"], case["work_flow_name"] ) ] = case + elif tag == 'testsuite': + case["work_flow_name"] = work_flow_name + case["invoking_xml"] = report.name + case["running_time_xml"] = case["time"] + case_name = report.parent.name + for ind in range(len(BACKENDS_LIST)): + if BACKENDS_LIST[ind] in report.parts: + case_name = case_name + "_" + BACKENDS_LIST[ind] + break + case["invoking_file"] = case_name + + test_cases[ ( case["invoking_file"], case["invoking_xml"], case["work_flow_name"] ) ] = case + + return test_cases + +def process_xml_element(element: ET.Element) -> Dict[str, Any]: + """Convert a test suite element into a JSON-serializable dict.""" + ret: Dict[str, Any] = {} + + # Convert attributes directly into dict elements. + # e.g. + # + # becomes: + # {"name": "test_foo", "classname": "test_bar"} + ret.update(element.attrib) + + # The XML format encodes all values as strings. Convert to ints/floats if + # possible to make aggregation possible in Rockset. + for k, v in ret.items(): + try: + ret[k] = int(v) + except ValueError: + pass + try: + ret[k] = float(v) + except ValueError: + pass + + # Convert inner and outer text into special dict elements. + # e.g. + # my_inner_text my_tail + # becomes: + # {"text": "my_inner_text", "tail": " my_tail"} + if element.text and element.text.strip(): + ret["text"] = element.text + if element.tail and element.tail.strip(): + ret["tail"] = element.tail + + # Convert child elements recursively, placing them at a key: + # e.g. + # + # hello + # world + # another + # + # becomes + # { + # "foo": [{"text": "hello"}, {"text": "world"}], + # "bar": {"text": "another"} + # } + for child in element: + if child.tag not in ret: + ret[child.tag] = process_xml_element(child) + else: + # If there are multiple tags with the same name, they should be + # coalesced into a list. 
+ if not isinstance(ret[child.tag], list): + ret[child.tag] = [ret[child.tag]] + ret[child.tag].append(process_xml_element(child)) + return ret \ No newline at end of file diff --git a/.automation_scripts/run_pytorch_unit_tests.py b/.automation_scripts/run_pytorch_unit_tests.py new file mode 100644 index 000000000000..514afd19624c --- /dev/null +++ b/.automation_scripts/run_pytorch_unit_tests.py @@ -0,0 +1,518 @@ +#!/usr/bin/env python3 + +""" The Python PyTorch testing script. +## +# Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. +""" + +import argparse +import os +import shutil +import subprocess +from subprocess import STDOUT, CalledProcessError + +from collections import namedtuple +from datetime import datetime +from pathlib import Path +from parse_xml_results import ( + parse_xml_report +) +from pprint import pprint +from typing import Any, Dict, List + +# unit test status list +UT_STATUS_LIST = [ + "PASSED", + "MISSED", + "SKIPPED", + "FAILED", + "XFAILED", + "ERROR" +] + +DEFAULT_CORE_TESTS = [ + "test_nn", + "test_torch", + "test_cuda", + "test_ops", + "test_unary_ufuncs", + "test_autograd", + "inductor/test_torchinductor" +] + +DISTRIBUTED_CORE_TESTS = [ + "distributed/test_c10d_common", + "distributed/test_c10d_nccl", + "distributed/test_distributed_spawn" +] + +CONSOLIDATED_LOG_FILE_NAME="pytorch_unit_tests.log" + +def parse_xml_reports_as_dict(workflow_run_id, workflow_run_attempt, tag, workflow_name, path="."): + test_cases = {} + items_list = os.listdir(path) + for dir in items_list: + new_dir = path + '/' + dir + '/' + if os.path.isdir(new_dir): + for xml_report in Path(new_dir).glob("**/*.xml"): + test_cases.update( + parse_xml_report( + tag, + xml_report, + workflow_run_id, + workflow_run_attempt, + workflow_name + ) + ) + return test_cases + +def get_test_status(test_case): + # In order of priority: S=skipped, F=failure, E=error, P=pass + if "skipped" in test_case and test_case["skipped"]: + type_message = test_case["skipped"] + if type_message.__contains__('type') and type_message['type'] == "pytest.xfail": + return "XFAILED" + else: + return "SKIPPED" + elif "failure" in test_case and test_case["failure"]: + return "FAILED" + elif "error" in test_case and test_case["error"]: + return "ERROR" + else: + return "PASSED" + +def get_test_message(test_case, status=None): + if status == "SKIPPED": + return test_case["skipped"] if "skipped" in test_case else "" + elif status == "FAILED": + return 
test_case["failure"] if "failure" in test_case else "" + elif status == "ERROR": + return test_case["error"] if "error" in test_case else "" + else: + if "skipped" in test_case: + return test_case["skipped"] + elif "failure" in test_case: + return test_case["failure"] + elif "error" in test_case: + return test_case["error"] + else: + return "" + +def get_test_file_running_time(test_suite): + if test_suite.__contains__('time'): + return test_suite["time"] + return 0 + +def get_test_running_time(test_case): + if test_case.__contains__('time'): + return test_case["time"] + return "" + +def summarize_xml_files(path, workflow_name): + # statistics + TOTAL_TEST_NUM = 0 + TOTAL_PASSED_NUM = 0 + TOTAL_SKIPPED_NUM = 0 + TOTAL_XFAIL_NUM = 0 + TOTAL_FAILED_NUM = 0 + TOTAL_ERROR_NUM = 0 + TOTAL_EXECUTION_TIME = 0 + + #parse the xml files + test_cases = parse_xml_reports_as_dict(-1, -1, 'testcase', workflow_name, path) + test_suites = parse_xml_reports_as_dict(-1, -1, 'testsuite', workflow_name, path) + test_file_and_status = namedtuple("test_file_and_status", ["file_name", "status"]) + # results dict + res = {} + res_item_list = [ "PASSED", "SKIPPED", "XFAILED", "FAILED", "ERROR" ] + test_file_items = set() + for (k,v) in list(test_suites.items()): + file_name = k[0] + if not file_name in test_file_items: + test_file_items.add(file_name) + # initialization + for item in res_item_list: + temp_item = test_file_and_status(file_name, item) + res[temp_item] = {} + temp_item_statistics = test_file_and_status(file_name, "STATISTICS") + res[temp_item_statistics] = {'TOTAL': 0, 'PASSED': 0, 'SKIPPED': 0, 'XFAILED': 0, 'FAILED': 0, 'ERROR': 0, 'EXECUTION_TIME': 0} + test_running_time = get_test_file_running_time(v) + res[temp_item_statistics]["EXECUTION_TIME"] += test_running_time + TOTAL_EXECUTION_TIME += test_running_time + else: + test_tuple_key_statistics = test_file_and_status(file_name, "STATISTICS") + test_running_time = get_test_file_running_time(v) + res[test_tuple_key_statistics]["EXECUTION_TIME"] += test_running_time + TOTAL_EXECUTION_TIME += test_running_time + + for (k,v) in list(test_cases.items()): + file_name = k[0] + class_name = k[1] + test_name = k[2] + combined_name = file_name + "::" + class_name + "::" + test_name + test_status = get_test_status(v) + test_running_time = get_test_running_time(v) + test_message = get_test_message(v, test_status) + test_info_value = "" + test_tuple_key_status = test_file_and_status(file_name, test_status) + test_tuple_key_statistics = test_file_and_status(file_name, "STATISTICS") + TOTAL_TEST_NUM += 1 + res[test_tuple_key_statistics]["TOTAL"] += 1 + if test_status == "PASSED": + test_info_value = str(test_running_time) + res[test_tuple_key_status][combined_name] = test_info_value + res[test_tuple_key_statistics]["PASSED"] += 1 + TOTAL_PASSED_NUM += 1 + elif test_status == "SKIPPED": + test_info_value = str(test_running_time) + res[test_tuple_key_status][combined_name] = test_info_value + res[test_tuple_key_statistics]["SKIPPED"] += 1 + TOTAL_SKIPPED_NUM += 1 + elif test_status == "XFAILED": + test_info_value = str(test_running_time) + res[test_tuple_key_status][combined_name] = test_info_value + res[test_tuple_key_statistics]["XFAILED"] += 1 + TOTAL_XFAIL_NUM += 1 + elif test_status == "FAILED": + test_info_value = test_message + res[test_tuple_key_status][combined_name] = test_info_value + res[test_tuple_key_statistics]["FAILED"] += 1 + TOTAL_FAILED_NUM += 1 + elif test_status == "ERROR": + test_info_value = test_message + 
res[test_tuple_key_status][combined_name] = test_info_value + res[test_tuple_key_statistics]["ERROR"] += 1 + TOTAL_ERROR_NUM += 1 + + # generate statistics_dict + statistics_dict = {} + statistics_dict["TOTAL"] = TOTAL_TEST_NUM + statistics_dict["PASSED"] = TOTAL_PASSED_NUM + statistics_dict["SKIPPED"] = TOTAL_SKIPPED_NUM + statistics_dict["XFAILED"] = TOTAL_XFAIL_NUM + statistics_dict["FAILED"] = TOTAL_FAILED_NUM + statistics_dict["ERROR"] = TOTAL_ERROR_NUM + statistics_dict["EXECUTION_TIME"] = TOTAL_EXECUTION_TIME + aggregate_item = workflow_name + "_aggregate" + total_item = test_file_and_status(aggregate_item, "STATISTICS") + res[total_item] = statistics_dict + + return res + +def run_command_and_capture_output(cmd): + try: + print(f"Running command '{cmd}'") + with open(CONSOLIDATED_LOG_FILE_PATH, "a+") as output_file: + print(f"========================================", file=output_file, flush=True) + print(f"[RUN_PYTORCH_UNIT_TESTS] Running command '{cmd}'", file=output_file, flush=True) # send to consolidated file as well + print(f"========================================", file=output_file, flush=True) + p = subprocess.run(cmd, shell=True, stdout=output_file, stderr=STDOUT, text=True) + except CalledProcessError as e: + print(f"ERROR: Cmd {cmd} failed with return code: {e.returncode}!") + +def run_entire_tests(workflow_name, test_shell_path, overall_logs_path_current_run, test_reports_src): + if os.path.exists(test_reports_src): + shutil.rmtree(test_reports_src) + + os.mkdir(test_reports_src) + copied_logs_path = "" + if workflow_name == "default": + os.environ['TEST_CONFIG'] = 'default' + copied_logs_path = overall_logs_path_current_run + "default_xml_results_entire_tests/" + elif workflow_name == "distributed": + os.environ['TEST_CONFIG'] = 'distributed' + copied_logs_path = overall_logs_path_current_run + "distributed_xml_results_entire_tests/" + elif workflow_name == "inductor": + os.environ['TEST_CONFIG'] = 'inductor' + copied_logs_path = overall_logs_path_current_run + "inductor_xml_results_entire_tests/" + # use test.sh for tests execution + run_command_and_capture_output(test_shell_path) + copied_logs_path_destination = shutil.copytree(test_reports_src, copied_logs_path) + entire_results_dict = summarize_xml_files(copied_logs_path_destination, workflow_name) + return entire_results_dict + +def run_priority_tests(workflow_name, test_run_test_path, overall_logs_path_current_run, test_reports_src): + if os.path.exists(test_reports_src): + shutil.rmtree(test_reports_src) + + os.mkdir(test_reports_src) + copied_logs_path = "" + if workflow_name == "default": + os.environ['TEST_CONFIG'] = 'default' + os.environ['HIP_VISIBLE_DEVICES'] = '0' + copied_logs_path = overall_logs_path_current_run + "default_xml_results_priority_tests/" + # use run_test.py for tests execution + default_priority_test_suites = " ".join(DEFAULT_CORE_TESTS) + command = "python3 " + test_run_test_path + " --include " + default_priority_test_suites + " --exclude-jit-executor --exclude-distributed-tests --verbose" + run_command_and_capture_output(command) + del os.environ['HIP_VISIBLE_DEVICES'] + elif workflow_name == "distributed": + os.environ['TEST_CONFIG'] = 'distributed' + os.environ['HIP_VISIBLE_DEVICES'] = '0,1' + copied_logs_path = overall_logs_path_current_run + "distributed_xml_results_priority_tests/" + # use run_test.py for tests execution + distributed_priority_test_suites = " ".join(DISTRIBUTED_CORE_TESTS) + command = "python3 " + test_run_test_path + " --include " + 
distributed_priority_test_suites + " --distributed-tests --verbose" + run_command_and_capture_output(command) + del os.environ['HIP_VISIBLE_DEVICES'] + copied_logs_path_destination = shutil.copytree(test_reports_src, copied_logs_path) + priority_results_dict = summarize_xml_files(copied_logs_path_destination, workflow_name) + + return priority_results_dict + +def run_selected_tests(workflow_name, test_run_test_path, overall_logs_path_current_run, test_reports_src, selected_list): + if os.path.exists(test_reports_src): + shutil.rmtree(test_reports_src) + + os.mkdir(test_reports_src) + copied_logs_path = "" + if workflow_name == "default": + os.environ['TEST_CONFIG'] = 'default' + os.environ['HIP_VISIBLE_DEVICES'] = '0' + copied_logs_path = overall_logs_path_current_run + "default_xml_results_selected_tests/" + # use run_test.py for tests execution + default_selected_test_suites = " ".join(selected_list) + command = "python3 " + test_run_test_path + " --include " + default_selected_test_suites + " --exclude-jit-executor --exclude-distributed-tests --verbose" + run_command_and_capture_output(command) + del os.environ['HIP_VISIBLE_DEVICES'] + elif workflow_name == "distributed": + os.environ['TEST_CONFIG'] = 'distributed' + os.environ['HIP_VISIBLE_DEVICES'] = '0,1' + copied_logs_path = overall_logs_path_current_run + "distributed_xml_results_selected_tests/" + # use run_test.py for tests execution + distributed_selected_test_suites = " ".join(selected_list) + command = "python3 " + test_run_test_path + " --include " + distributed_selected_test_suites + " --distributed-tests --verbose" + run_command_and_capture_output(command) + del os.environ['HIP_VISIBLE_DEVICES'] + elif workflow_name == "inductor": + os.environ['TEST_CONFIG'] = 'inductor' + copied_logs_path = overall_logs_path_current_run + "inductor_xml_results_selected_tests/" + inductor_selected_test_suites = "" + non_inductor_selected_test_suites = "" + for item in selected_list: + if "inductor/" in item: + inductor_selected_test_suites += item + inductor_selected_test_suites += " " + else: + non_inductor_selected_test_suites += item + non_inductor_selected_test_suites += " " + if inductor_selected_test_suites != "": + inductor_selected_test_suites = inductor_selected_test_suites[:-1] + command = "python3 " + test_run_test_path + " --include " + inductor_selected_test_suites + " --verbose" + run_command_and_capture_output(command) + if non_inductor_selected_test_suites != "": + non_inductor_selected_test_suites = non_inductor_selected_test_suites[:-1] + command = "python3 " + test_run_test_path + " --inductor --include " + non_inductor_selected_test_suites + " --verbose" + run_command_and_capture_output(command) + copied_logs_path_destination = shutil.copytree(test_reports_src, copied_logs_path) + selected_results_dict = summarize_xml_files(copied_logs_path_destination, workflow_name) + + return selected_results_dict + +def run_test_and_summarize_results( + pytorch_root_dir: str, + priority_tests: bool, + test_config: List[str], + default_list: List[str], + distributed_list: List[str], + inductor_list: List[str], + skip_rerun: bool) -> Dict[str, Any]: + + # copy current environment variables + _environ = dict(os.environ) + + # modify path + test_shell_path = pytorch_root_dir + "/.ci/pytorch/test.sh" + test_run_test_path = pytorch_root_dir + "/test/run_test.py" + repo_test_log_folder_path = pytorch_root_dir + "/.automation_logs/" + test_reports_src = pytorch_root_dir + "/test/test-reports/" + run_test_python_file = pytorch_root_dir + 
"/test/run_test.py" + + # change directory to pytorch root + os.chdir(pytorch_root_dir) + + # all test results dict + res_all_tests_dict = {} + + # patterns + search_text = "--reruns=2" + replace_text = "--reruns=0" + + # create logs folder + if not os.path.exists(repo_test_log_folder_path): + os.mkdir(repo_test_log_folder_path) + + # Set common environment variables for all scenarios + os.environ['CI'] = '1' + os.environ['PYTORCH_TEST_WITH_ROCM'] = '1' + os.environ['HSA_FORCE_FINE_GRAIN_PCIE'] = '1' + os.environ['PYTORCH_TESTING_DEVICE_ONLY_FOR'] = 'cuda' + os.environ['CONTINUE_THROUGH_ERROR'] = 'True' + if skip_rerun: + # modify run_test.py in-place + with open(run_test_python_file, 'r') as file: + data = file.read() + data = data.replace(search_text, replace_text) + with open(run_test_python_file, 'w') as file: + file.write(data) + + # Time stamp + current_datetime = datetime.now().strftime("%Y%m%d_%H-%M-%S") + print("Current date & time : ", current_datetime) + # performed as Job ID + str_current_datetime = str(current_datetime) + overall_logs_path_current_run = repo_test_log_folder_path + str_current_datetime + "/" + os.mkdir(overall_logs_path_current_run) + + global CONSOLIDATED_LOG_FILE_PATH + CONSOLIDATED_LOG_FILE_PATH = overall_logs_path_current_run + CONSOLIDATED_LOG_FILE_NAME + + # Check multi gpu availability if distributed tests are enabled + if ("distributed" in test_config) or len(distributed_list) != 0: + check_num_gpus_for_distributed() + + # Install test requirements + command = "pip3 install -r requirements.txt && pip3 install -r .ci/docker/requirements-ci.txt" + run_command_and_capture_output(command) + + # Run entire tests for each workflow + if not priority_tests and not default_list and not distributed_list and not inductor_list: + # run entire tests for default, distributed and inductor workflows → use test.sh + if not test_config: + check_num_gpus_for_distributed() + # default test process + res_default_all = run_entire_tests("default", test_shell_path, overall_logs_path_current_run, test_reports_src) + res_all_tests_dict["default"] = res_default_all + # distributed test process + res_distributed_all = run_entire_tests("distributed", test_shell_path, overall_logs_path_current_run, test_reports_src) + res_all_tests_dict["distributed"] = res_distributed_all + # inductor test process + res_inductor_all = run_entire_tests("inductor", test_shell_path, overall_logs_path_current_run, test_reports_src) + res_all_tests_dict["inductor"] = res_inductor_all + else: + workflow_list = [] + for item in test_config: + workflow_list.append(item) + if "default" in workflow_list: + res_default_all = run_entire_tests("default", test_shell_path, overall_logs_path_current_run, test_reports_src) + res_all_tests_dict["default"] = res_default_all + if "distributed" in workflow_list: + res_distributed_all = run_entire_tests("distributed", test_shell_path, overall_logs_path_current_run, test_reports_src) + res_all_tests_dict["distributed"] = res_distributed_all + if "inductor" in workflow_list: + res_inductor_all = run_entire_tests("inductor", test_shell_path, overall_logs_path_current_run, test_reports_src) + res_all_tests_dict["inductor"] = res_inductor_all + # Run priority test for each workflow + elif priority_tests and not default_list and not distributed_list and not inductor_list: + if not test_config: + check_num_gpus_for_distributed() + # default test process + res_default_priority = run_priority_tests("default", test_run_test_path, overall_logs_path_current_run, test_reports_src) + 
res_all_tests_dict["default"] = res_default_priority + # distributed test process + res_distributed_priority = run_priority_tests("distributed", test_run_test_path, overall_logs_path_current_run, test_reports_src) + res_all_tests_dict["distributed"] = res_distributed_priority + # will not run inductor priority tests + print("Inductor priority tests cannot run since no core tests defined with inductor workflow.") + else: + workflow_list = [] + for item in test_config: + workflow_list.append(item) + if "default" in workflow_list: + res_default_priority = run_priority_tests("default", test_run_test_path, overall_logs_path_current_run, test_reports_src) + res_all_tests_dict["default"] = res_default_priority + if "distributed" in workflow_list: + res_distributed_priority = run_priority_tests("distributed", test_run_test_path, overall_logs_path_current_run, test_reports_src) + res_all_tests_dict["distributed"] = res_distributed_priority + if "inductor" in workflow_list: + print("Inductor priority tests cannot run since no core tests defined with inductor workflow.") + # Run specified tests for each workflow + elif (default_list or distributed_list or inductor_list) and not test_config and not priority_tests: + if default_list: + default_workflow_list = [] + for item in default_list: + default_workflow_list.append(item) + res_default_selected = run_selected_tests("default", test_run_test_path, overall_logs_path_current_run, test_reports_src, default_workflow_list) + res_all_tests_dict["default"] = res_default_selected + if distributed_list: + distributed_workflow_list = [] + for item in distributed_list: + distributed_workflow_list.append(item) + res_distributed_selected = run_selected_tests("distributed", test_run_test_path, overall_logs_path_current_run, test_reports_src, distributed_workflow_list) + res_all_tests_dict["distributed"] = res_distributed_selected + if inductor_list: + inductor_workflow_list = [] + for item in inductor_list: + inductor_workflow_list.append(item) + res_inductor_selected = run_selected_tests("inductor", test_run_test_path, overall_logs_path_current_run, test_reports_src, inductor_workflow_list) + res_all_tests_dict["inductor"] = res_inductor_selected + else: + raise Exception("Invalid test configurations!") + + # restore environment variables + os.environ.clear() + os.environ.update(_environ) + + # restore files + if skip_rerun: + # modify run_test.py in-place + with open(run_test_python_file, 'r') as file: + data = file.read() + data = data.replace(replace_text, search_text) + with open(run_test_python_file, 'w') as file: + file.write(data) + + return res_all_tests_dict + +def parse_args(): + parser = argparse.ArgumentParser(description='Run PyTorch unit tests and generate xml results summary', formatter_class=argparse.RawTextHelpFormatter) + parser.add_argument('--test_config', nargs='+', default=[], type=str, help="space-separated list of test workflows to be executed eg. 'default distributed'") + parser.add_argument('--priority_tests', action='store_true', help="run priority tests only") + parser.add_argument('--default_list', nargs='+', default=[], help="space-separated list of 'default' config test suites/files to be executed eg. 'test_weak test_dlpack'") + parser.add_argument('--distributed_list', nargs='+', default=[], help="space-separated list of 'distributed' config test suites/files to be executed eg. 
'distributed/test_c10d_common distributed/test_c10d_nccl'") + parser.add_argument('--inductor_list', nargs='+', default=[], help="space-separated list of 'inductor' config test suites/files to be executed eg. 'inductor/test_torchinductor test_ops'") + parser.add_argument('--pytorch_root', default='.', type=str, help="PyTorch root directory") + parser.add_argument('--skip_rerun', action='store_true', help="skip rerun process") + parser.add_argument('--example_output', type=str, help="{'workflow_name': {\n" + " test_file_and_status(file_name='workflow_aggregate', status='STATISTICS'): {}, \n" + " test_file_and_status(file_name='test_file_name_1', status='ERROR'): {}, \n" + " test_file_and_status(file_name='test_file_name_1', status='FAILED'): {}, \n" + " test_file_and_status(file_name='test_file_name_1', status='PASSED'): {}, \n" + " test_file_and_status(file_name='test_file_name_1', status='SKIPPED'): {}, \n" + " test_file_and_status(file_name='test_file_name_1', status='STATISTICS'): {} \n" + "}}\n") + parser.add_argument('--example_usages', type=str, help="RUN ALL TESTS: python3 run_pytorch_unit_tests.py \n" + "RUN PRIORITY TESTS: python3 run_pytorch_unit_tests.py --test_config distributed --priority_test \n" + "RUN SELECTED TESTS: python3 run_pytorch_unit_tests.py --default_list test_weak test_dlpack --inductor_list inductor/test_torchinductor") + return parser.parse_args() + +def check_num_gpus_for_distributed(): + p = subprocess.run("rocminfo | grep -cE 'Name:\s+gfx'", shell=True, capture_output=True, text=True) + num_gpus_visible = int(p.stdout) + assert num_gpus_visible > 1, "Number of visible GPUs should be >1 to run distributed unit tests" + +def main(): + args = parse_args() + all_tests_results = run_test_and_summarize_results(args.pytorch_root, args.priority_tests, args.test_config, args.default_list, args.distributed_list, args.inductor_list, args.skip_rerun) + pprint(dict(all_tests_results)) + +if __name__ == "__main__": + main() diff --git a/.ci/aarch64_linux/aarch64_ci_build.sh b/.ci/aarch64_linux/aarch64_ci_build.sh index 424ddd0013cd..41cabc3bf511 100644 --- a/.ci/aarch64_linux/aarch64_ci_build.sh +++ b/.ci/aarch64_linux/aarch64_ci_build.sh @@ -3,8 +3,20 @@ set -eux -o pipefail GPU_ARCH_VERSION=${GPU_ARCH_VERSION:-} -if [[ "$GPU_ARCH_VERSION" == *"12.9"* ]]; then +# Set CUDA architecture lists to match x86 build_cuda.sh +if [[ "$GPU_ARCH_VERSION" == *"12.6"* ]]; then + export TORCH_CUDA_ARCH_LIST="8.0;9.0" +elif [[ "$GPU_ARCH_VERSION" == *"12.8"* ]]; then export TORCH_CUDA_ARCH_LIST="8.0;9.0;10.0;12.0" +elif [[ "$GPU_ARCH_VERSION" == *"13.0"* ]]; then + export TORCH_CUDA_ARCH_LIST="8.0;9.0;10.0;11.0;12.0+PTX" +fi + +# Compress the fatbin with -compress-mode=size for CUDA 13 +if [[ "$DESIRED_CUDA" == *"13"* ]]; then + export TORCH_NVCC_FLAGS="-compress-mode=size" + # Bundle ptxas into the cu13 wheel, see https://github.com/pytorch/pytorch/issues/163801 + export BUILD_BUNDLE_PTXAS=1 fi SCRIPTPATH="$( cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )" @@ -18,7 +30,7 @@ cd / # on the mounted pytorch repo git config --global --add safe.directory /pytorch pip install -r /pytorch/requirements.txt -pip install auditwheel==6.2.0 +pip install auditwheel==6.2.0 wheel if [ "$DESIRED_CUDA" = "cpu" ]; then echo "BASE_CUDA_VERSION is not set. Building cpu wheel." 
#USE_PRIORITIZED_TEXT_FOR_LD for enable linker script optimization https://github.com/pytorch/pytorch/pull/121975/files @@ -26,6 +38,16 @@ if [ "$DESIRED_CUDA" = "cpu" ]; then else echo "BASE_CUDA_VERSION is set to: $DESIRED_CUDA" export USE_SYSTEM_NCCL=1 + + # Check if we should use NVIDIA libs from PyPI (similar to x86 build_cuda.sh logic) + if [[ -z "$PYTORCH_EXTRA_INSTALL_REQUIREMENTS" ]]; then + echo "Bundling CUDA libraries with wheel for aarch64." + else + echo "Using nvidia libs from pypi for aarch64." + echo "Updated PYTORCH_EXTRA_INSTALL_REQUIREMENTS for aarch64: $PYTORCH_EXTRA_INSTALL_REQUIREMENTS" + export USE_NVIDIA_PYPI_LIBS=1 + fi + #USE_PRIORITIZED_TEXT_FOR_LD for enable linker script optimization https://github.com/pytorch/pytorch/pull/121975/files USE_PRIORITIZED_TEXT_FOR_LD=1 python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn --enable-cuda fi diff --git a/.ci/aarch64_linux/aarch64_wheel_ci_build.py b/.ci/aarch64_linux/aarch64_wheel_ci_build.py index a2b5f6912c9a..1b6429fa8c06 100755 --- a/.ci/aarch64_linux/aarch64_wheel_ci_build.py +++ b/.ci/aarch64_linux/aarch64_wheel_ci_build.py @@ -69,62 +69,186 @@ def replace_tag(filename) -> None: f.writelines(lines) +def patch_library_rpath( + folder: str, + lib_name: str, + use_nvidia_pypi_libs: bool = False, + desired_cuda: str = "", +) -> None: + """Apply patchelf to set RPATH for a library in torch/lib""" + lib_path = f"{folder}/tmp/torch/lib/{lib_name}" + + if use_nvidia_pypi_libs: + # For PyPI NVIDIA libraries, construct CUDA RPATH + cuda_rpaths = [ + "$ORIGIN/../../nvidia/cudnn/lib", + "$ORIGIN/../../nvidia/nvshmem/lib", + "$ORIGIN/../../nvidia/nccl/lib", + "$ORIGIN/../../nvidia/cusparselt/lib", + ] + + if "130" in desired_cuda: + cuda_rpaths.append("$ORIGIN/../../nvidia/cu13/lib") + else: + cuda_rpaths.extend( + [ + "$ORIGIN/../../nvidia/cublas/lib", + "$ORIGIN/../../nvidia/cuda_cupti/lib", + "$ORIGIN/../../nvidia/cuda_nvrtc/lib", + "$ORIGIN/../../nvidia/cuda_runtime/lib", + "$ORIGIN/../../nvidia/cufft/lib", + "$ORIGIN/../../nvidia/curand/lib", + "$ORIGIN/../../nvidia/cusolver/lib", + "$ORIGIN/../../nvidia/cusparse/lib", + "$ORIGIN/../../nvidia/nvtx/lib", + "$ORIGIN/../../nvidia/cufile/lib", + ] + ) + + # Add $ORIGIN for local torch libs + rpath = ":".join(cuda_rpaths) + ":$ORIGIN" + else: + # For bundled libraries, just use $ORIGIN + rpath = "$ORIGIN" + + if os.path.exists(lib_path): + os.system( + f"cd {folder}/tmp/torch/lib/; " + f"patchelf --set-rpath '{rpath}' --force-rpath {lib_name}" + ) + + +def copy_and_patch_library( + src_path: str, + folder: str, + use_nvidia_pypi_libs: bool = False, + desired_cuda: str = "", +) -> None: + """Copy a library to torch/lib and patch its RPATH""" + if os.path.exists(src_path): + lib_name = os.path.basename(src_path) + shutil.copy2(src_path, f"{folder}/tmp/torch/lib/{lib_name}") + patch_library_rpath(folder, lib_name, use_nvidia_pypi_libs, desired_cuda) + + def package_cuda_wheel(wheel_path, desired_cuda) -> None: """ Package the cuda wheel libraries """ folder = os.path.dirname(wheel_path) - wheelname = os.path.basename(wheel_path) os.mkdir(f"{folder}/tmp") os.system(f"unzip {wheel_path} -d {folder}/tmp") - libs_to_copy = [ - "/usr/local/cuda/extras/CUPTI/lib64/libcupti.so.12", - "/usr/local/cuda/extras/CUPTI/lib64/libnvperf_host.so", - "/usr/local/cuda/lib64/libcudnn.so.9", - "/usr/local/cuda/lib64/libcublas.so.12", - "/usr/local/cuda/lib64/libcublasLt.so.12", - "/usr/local/cuda/lib64/libcudart.so.12", - "/usr/local/cuda/lib64/libcufft.so.11", - 
"/usr/local/cuda/lib64/libcusparse.so.12", - "/usr/local/cuda/lib64/libcusparseLt.so.0", - "/usr/local/cuda/lib64/libcusolver.so.11", - "/usr/local/cuda/lib64/libcurand.so.10", - "/usr/local/cuda/lib64/libnccl.so.2", - "/usr/local/cuda/lib64/libnvJitLink.so.12", - "/usr/local/cuda/lib64/libnvrtc.so.12", - "/usr/local/cuda/lib64/libnvshmem_host.so.3", - "/usr/local/cuda/lib64/libcudnn_adv.so.9", - "/usr/local/cuda/lib64/libcudnn_cnn.so.9", - "/usr/local/cuda/lib64/libcudnn_graph.so.9", - "/usr/local/cuda/lib64/libcudnn_ops.so.9", - "/usr/local/cuda/lib64/libcudnn_engines_runtime_compiled.so.9", - "/usr/local/cuda/lib64/libcudnn_engines_precompiled.so.9", - "/usr/local/cuda/lib64/libcudnn_heuristic.so.9", - "/lib64/libgomp.so.1", - "/usr/lib64/libgfortran.so.5", - "/acl/build/libarm_compute.so", - "/acl/build/libarm_compute_graph.so", - "/usr/local/lib/libnvpl_lapack_lp64_gomp.so.0", - "/usr/local/lib/libnvpl_blas_lp64_gomp.so.0", - "/usr/local/lib/libnvpl_lapack_core.so.0", - "/usr/local/lib/libnvpl_blas_core.so.0", - ] + # Delete original wheel since it will be repackaged + os.system(f"rm {wheel_path}") + + # Check if we should use PyPI NVIDIA libraries or bundle system libraries + use_nvidia_pypi_libs = os.getenv("USE_NVIDIA_PYPI_LIBS", "0") == "1" + + if use_nvidia_pypi_libs: + print("Using nvidia libs from pypi - skipping CUDA library bundling") + # For PyPI approach, we don't bundle CUDA libraries - they come from PyPI packages + # We only need to bundle non-NVIDIA libraries + minimal_libs_to_copy = [ + "/lib64/libgomp.so.1", + "/usr/lib64/libgfortran.so.5", + "/acl/build/libarm_compute.so", + "/acl/build/libarm_compute_graph.so", + "/usr/local/lib/libnvpl_lapack_lp64_gomp.so.0", + "/usr/local/lib/libnvpl_blas_lp64_gomp.so.0", + "/usr/local/lib/libnvpl_lapack_core.so.0", + "/usr/local/lib/libnvpl_blas_core.so.0", + ] - if "129" in desired_cuda: - libs_to_copy += [ - "/usr/local/cuda/lib64/libnvrtc-builtins.so.12.9", + # Copy minimal libraries to unzipped_folder/torch/lib + for lib_path in minimal_libs_to_copy: + copy_and_patch_library(lib_path, folder, use_nvidia_pypi_libs, desired_cuda) + + # Patch torch libraries used for searching libraries + torch_libs_to_patch = [ + "libtorch.so", + "libtorch_cpu.so", + "libtorch_cuda.so", + "libtorch_cuda_linalg.so", + "libtorch_global_deps.so", + "libtorch_python.so", + "libtorch_nvshmem.so", + "libc10.so", + "libc10_cuda.so", + "libcaffe2_nvrtc.so", + "libshm.so", + ] + for lib_name in torch_libs_to_patch: + patch_library_rpath(folder, lib_name, use_nvidia_pypi_libs, desired_cuda) + else: + print("Bundling CUDA libraries with wheel") + # Original logic for bundling system CUDA libraries + # Common libraries for all CUDA versions + common_libs = [ + # Non-NVIDIA system libraries + "/lib64/libgomp.so.1", + "/usr/lib64/libgfortran.so.5", + "/acl/build/libarm_compute.so", + "/acl/build/libarm_compute_graph.so", + # Common CUDA libraries (same for all versions) + "/usr/local/lib/libnvpl_lapack_lp64_gomp.so.0", + "/usr/local/lib/libnvpl_blas_lp64_gomp.so.0", + "/usr/local/lib/libnvpl_lapack_core.so.0", + "/usr/local/lib/libnvpl_blas_core.so.0", + "/usr/local/cuda/extras/CUPTI/lib64/libnvperf_host.so", + "/usr/local/cuda/lib64/libcudnn.so.9", + "/usr/local/cuda/lib64/libcusparseLt.so.0", + "/usr/local/cuda/lib64/libcurand.so.10", + "/usr/local/cuda/lib64/libnccl.so.2", + "/usr/local/cuda/lib64/libnvshmem_host.so.3", + "/usr/local/cuda/lib64/libcudnn_adv.so.9", + "/usr/local/cuda/lib64/libcudnn_cnn.so.9", + "/usr/local/cuda/lib64/libcudnn_graph.so.9", 
+ "/usr/local/cuda/lib64/libcudnn_ops.so.9", + "/usr/local/cuda/lib64/libcudnn_engines_runtime_compiled.so.9", + "/usr/local/cuda/lib64/libcudnn_engines_precompiled.so.9", + "/usr/local/cuda/lib64/libcudnn_heuristic.so.9", "/usr/local/cuda/lib64/libcufile.so.0", "/usr/local/cuda/lib64/libcufile_rdma.so.1", + "/usr/local/cuda/lib64/libcusparse.so.12", ] - # Copy libraries to unzipped_folder/a/lib - for lib_path in libs_to_copy: - lib_name = os.path.basename(lib_path) - shutil.copy2(lib_path, f"{folder}/tmp/torch/lib/{lib_name}") - os.system( - f"cd {folder}/tmp/torch/lib/; " - f"patchelf --set-rpath '$ORIGIN' --force-rpath {folder}/tmp/torch/lib/{lib_name}" - ) + # CUDA version-specific libraries + if "13" in desired_cuda: + minor_version = desired_cuda[-1] + version_specific_libs = [ + "/usr/local/cuda/extras/CUPTI/lib64/libcupti.so.13", + "/usr/local/cuda/lib64/libcublas.so.13", + "/usr/local/cuda/lib64/libcublasLt.so.13", + "/usr/local/cuda/lib64/libcudart.so.13", + "/usr/local/cuda/lib64/libcufft.so.12", + "/usr/local/cuda/lib64/libcusolver.so.12", + "/usr/local/cuda/lib64/libnvJitLink.so.13", + "/usr/local/cuda/lib64/libnvrtc.so.13", + f"/usr/local/cuda/lib64/libnvrtc-builtins.so.13.{minor_version}", + ] + elif "12" in desired_cuda: + # Get the last character for libnvrtc-builtins version (e.g., "129" -> "9") + minor_version = desired_cuda[-1] + version_specific_libs = [ + "/usr/local/cuda/extras/CUPTI/lib64/libcupti.so.12", + "/usr/local/cuda/lib64/libcublas.so.12", + "/usr/local/cuda/lib64/libcublasLt.so.12", + "/usr/local/cuda/lib64/libcudart.so.12", + "/usr/local/cuda/lib64/libcufft.so.11", + "/usr/local/cuda/lib64/libcusolver.so.11", + "/usr/local/cuda/lib64/libnvJitLink.so.12", + "/usr/local/cuda/lib64/libnvrtc.so.12", + f"/usr/local/cuda/lib64/libnvrtc-builtins.so.12.{minor_version}", + ] + else: + raise ValueError(f"Unsupported CUDA version: {desired_cuda}.") + + # Combine all libraries + libs_to_copy = common_libs + version_specific_libs + + # Copy libraries to unzipped_folder/torch/lib + for lib_path in libs_to_copy: + copy_and_patch_library(lib_path, folder, use_nvidia_pypi_libs, desired_cuda) # Make sure the wheel is tagged with manylinux_2_28 for f in os.scandir(f"{folder}/tmp/"): @@ -132,14 +256,8 @@ def package_cuda_wheel(wheel_path, desired_cuda) -> None: replace_tag(f"{f.path}/WHEEL") break - os.mkdir(f"{folder}/cuda_wheel") - os.system(f"cd {folder}/tmp/; zip -r {folder}/cuda_wheel/{wheelname} *") - shutil.move( - f"{folder}/cuda_wheel/{wheelname}", - f"{folder}/{wheelname}", - copy_function=shutil.copy2, - ) - os.system(f"rm -rf {folder}/tmp/ {folder}/cuda_wheel/") + os.system(f"wheel pack {folder}/tmp/ -d {folder}") + os.system(f"rm -rf {folder}/tmp/") def complete_wheel(folder: str) -> str: @@ -162,14 +280,7 @@ def complete_wheel(folder: str) -> str: f"/{folder}/dist/{repaired_wheel_name}", ) else: - repaired_wheel_name = wheel_name.replace( - "linux_aarch64", "manylinux_2_28_aarch64" - ) - print(f"Renaming {wheel_name} wheel to {repaired_wheel_name}") - os.rename( - f"/{folder}/dist/{wheel_name}", - f"/{folder}/dist/{repaired_wheel_name}", - ) + repaired_wheel_name = list_dir(f"/{folder}/dist")[0] print(f"Copying {repaired_wheel_name} to artifacts") shutil.copy2( @@ -211,6 +322,16 @@ def parse_arguments(): if enable_cuda: build_vars += "MAX_JOBS=5 " + # Handle PyPI NVIDIA libraries vs bundled libraries + use_nvidia_pypi_libs = os.getenv("USE_NVIDIA_PYPI_LIBS", "0") == "1" + if use_nvidia_pypi_libs: + print("Configuring build for PyPI NVIDIA libraries") + # 
Configure for dynamic linking (matching x86 logic) + build_vars += "ATEN_STATIC_CUDA=0 USE_CUDA_STATIC_LINK=0 USE_CUPTI_SO=1 " + else: + print("Configuring build for bundled NVIDIA libraries") + # Keep existing static linking approach - already configured above + override_package_version = os.getenv("OVERRIDE_PACKAGE_VERSION") desired_cuda = os.getenv("DESIRED_CUDA") if override_package_version is not None: diff --git a/.ci/docker/build.sh b/.ci/docker/build.sh index f22aa919e434..8672fae2bbdd 100755 --- a/.ci/docker/build.sh +++ b/.ci/docker/build.sh @@ -81,8 +81,8 @@ elif [[ "$image" == *riscv* ]]; then DOCKERFILE="ubuntu-cross-riscv/Dockerfile" fi -_UCX_COMMIT=7bb2722ff2187a0cad557ae4a6afa090569f83fb -_UCC_COMMIT=20eae37090a4ce1b32bcce6144ccad0b49943e0b +_UCX_COMMIT=7836b165abdbe468a2f607e7254011c07d788152 +_UCC_COMMIT=430e241bf5d38cbc73fc7a6b89155397232e3f96 if [[ "$image" == *rocm* ]]; then _UCX_COMMIT=cc312eaa4655c0cc5c2bcd796db938f90563bcf6 _UCC_COMMIT=0c0fc21559835044ab107199e334f7157d6a0d3d @@ -114,31 +114,19 @@ case "$tag" in UCC_COMMIT=${_UCC_COMMIT} TRITON=yes ;; - pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks) - CUDA_VERSION=12.8.1 + pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc11) + CUDA_VERSION=13.0.0 ANACONDA_PYTHON_VERSION=3.10 - GCC_VERSION=9 - VISION=yes - KATEX=yes - UCX_COMMIT=${_UCX_COMMIT} - UCC_COMMIT=${_UCC_COMMIT} - TRITON=yes - INDUCTOR_BENCHMARKS=yes - ;; - pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc9-inductor-benchmarks) - CUDA_VERSION=12.8.1 - ANACONDA_PYTHON_VERSION=3.12 - GCC_VERSION=9 + GCC_VERSION=11 VISION=yes KATEX=yes UCX_COMMIT=${_UCX_COMMIT} UCC_COMMIT=${_UCC_COMMIT} TRITON=yes - INDUCTOR_BENCHMARKS=yes ;; - pytorch-linux-jammy-cuda12.8-cudnn9-py3.13-gcc9-inductor-benchmarks) + pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks) CUDA_VERSION=12.8.1 - ANACONDA_PYTHON_VERSION=3.13 + ANACONDA_PYTHON_VERSION=3.10 GCC_VERSION=9 VISION=yes KATEX=yes @@ -173,8 +161,8 @@ case "$tag" in VISION=yes ONNX=yes ;; - pytorch-linux-jammy-py3.9-clang12) - ANACONDA_PYTHON_VERSION=3.9 + pytorch-linux-jammy-py3.10-clang12) + ANACONDA_PYTHON_VERSION=3.10 CLANG_VERSION=12 VISION=yes TRITON=yes @@ -209,24 +197,24 @@ case "$tag" in UCC_COMMIT=${_UCC_COMMIT} PYTORCH_ROCM_ARCH="gfx90a;gfx942;gfx950" ;; - pytorch-linux-jammy-xpu-2025.0-py3) - ANACONDA_PYTHON_VERSION=3.9 + pytorch-linux-jammy-xpu-n-1-py3) + ANACONDA_PYTHON_VERSION=3.10 GCC_VERSION=11 VISION=yes - XPU_VERSION=2025.0 + XPU_VERSION=2025.1 NINJA_VERSION=1.9.0 TRITON=yes ;; - pytorch-linux-jammy-xpu-2025.1-py3) - ANACONDA_PYTHON_VERSION=3.9 + pytorch-linux-jammy-xpu-n-py3) + ANACONDA_PYTHON_VERSION=3.10 GCC_VERSION=11 VISION=yes - XPU_VERSION=2025.1 + XPU_VERSION=2025.2 NINJA_VERSION=1.9.0 TRITON=yes ;; - pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks) - ANACONDA_PYTHON_VERSION=3.9 + pytorch-linux-jammy-py3-gcc11-inductor-benchmarks) + ANACONDA_PYTHON_VERSION=3.10 GCC_VERSION=11 VISION=yes KATEX=yes @@ -234,8 +222,8 @@ case "$tag" in DOCS=yes INDUCTOR_BENCHMARKS=yes ;; - pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-clang12) - ANACONDA_PYTHON_VERSION=3.9 + pytorch-linux-jammy-cuda12.8-cudnn9-py3.10-clang12) + ANACONDA_PYTHON_VERSION=3.10 CUDA_VERSION=12.8.1 CLANG_VERSION=12 VISION=yes @@ -246,8 +234,8 @@ case "$tag" in CLANG_VERSION=18 VISION=yes ;; - pytorch-linux-jammy-py3.9-gcc11) - ANACONDA_PYTHON_VERSION=3.9 + pytorch-linux-jammy-py3.10-gcc11) + ANACONDA_PYTHON_VERSION=3.10 GCC_VERSION=11 VISION=yes KATEX=yes @@ -274,13 +262,10 @@ case "$tag" in TRITON_CPU=yes ;; 
pytorch-linux-jammy-linter) - # TODO: Use 3.9 here because of this issue https://github.com/python/mypy/issues/13627. - # We will need to update mypy version eventually, but that's for another day. The task - # would be to upgrade mypy to 1.0.0 with Python 3.11 - PYTHON_VERSION=3.9 + PYTHON_VERSION=3.10 ;; - pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-linter) - PYTHON_VERSION=3.9 + pytorch-linux-jammy-cuda12.8-cudnn9-py3.10-linter) + PYTHON_VERSION=3.10 CUDA_VERSION=12.8.1 ;; pytorch-linux-jammy-aarch64-py3.10-gcc11) diff --git a/.ci/docker/ci_commit_pins/torchbench.txt b/.ci/docker/ci_commit_pins/torchbench.txt index efbc3ceeb2af..c9be7b440bae 100644 --- a/.ci/docker/ci_commit_pins/torchbench.txt +++ b/.ci/docker/ci_commit_pins/torchbench.txt @@ -1 +1 @@ -e03a63be43e33596f7f0a43b0f530353785e4a59 +74a23feff57432129df84d8099e622773cf77925 diff --git a/.ci/docker/ci_commit_pins/triton-xpu.txt b/.ci/docker/ci_commit_pins/triton-xpu.txt index 3be14be85ad6..b03606f6defc 100644 --- a/.ci/docker/ci_commit_pins/triton-xpu.txt +++ b/.ci/docker/ci_commit_pins/triton-xpu.txt @@ -1 +1 @@ -a6572fb0be5b9b0a19b0641a0ce05810fa04e44c +1b0418a9a454b2b93ab8d71f40e59d2297157fae diff --git a/.ci/docker/ci_commit_pins/triton.txt b/.ci/docker/ci_commit_pins/triton.txt index 60c896b80c8f..99ec5b4aa341 100644 --- a/.ci/docker/ci_commit_pins/triton.txt +++ b/.ci/docker/ci_commit_pins/triton.txt @@ -1 +1 @@ -f7888497a1eb9e98d4c07537f0d0bcfe180d1363 +d08c31a24d622b4bf767a6645135b7b3d0d886f4 diff --git a/.ci/docker/common/install_cpython.sh b/.ci/docker/common/install_cpython.sh index c160e5704ba3..692edd0b898f 100755 --- a/.ci/docker/common/install_cpython.sh +++ b/.ci/docker/common/install_cpython.sh @@ -83,9 +83,9 @@ function build_cpython { py_suffix=${py_ver::-1} py_folder=$py_suffix fi - # Only b3 is available now + # Update to rc2 due to https://github.com/python/cpython/commit/c72699086fe4 if [ "$py_suffix" == "3.14.0" ]; then - py_suffix="3.14.0b3" + py_suffix="3.14.0rc2" fi wget -q $PYTHON_DOWNLOAD_URL/$py_folder/Python-$py_suffix.tgz -O Python-$py_ver.tgz do_cpython_build $py_ver Python-$py_suffix diff --git a/.ci/docker/common/install_cuda.sh b/.ci/docker/common/install_cuda.sh index 00c3cfd06b41..c6808ea4a7a2 100644 --- a/.ci/docker/common/install_cuda.sh +++ b/.ci/docker/common/install_cuda.sh @@ -147,7 +147,7 @@ function install_128 { } function install_130 { - CUDNN_VERSION=9.12.0.46 + CUDNN_VERSION=9.13.0.50 echo "Installing CUDA 13.0 and cuDNN ${CUDNN_VERSION} and NVSHMEM and NCCL and cuSparseLt-0.7.1" # install CUDA 13.0 in the same container install_cuda 13.0.0 cuda_13.0.0_580.65.06_linux diff --git a/.ci/docker/common/install_triton.sh b/.ci/docker/common/install_triton.sh index f48140952c3a..8e714bcb6cd3 100755 --- a/.ci/docker/common/install_triton.sh +++ b/.ci/docker/common/install_triton.sh @@ -21,7 +21,7 @@ elif [ -n "${TRITON_CPU}" ]; then TRITON_REPO="https://github.com/triton-lang/triton-cpu" TRITON_TEXT_FILE="triton-cpu" else - TRITON_REPO="https://github.com/triton-lang/triton" + TRITON_REPO="https://github.com/ROCm/triton" TRITON_TEXT_FILE="triton" fi diff --git a/.ci/docker/common/install_ucc.sh b/.ci/docker/common/install_ucc.sh index b7f884ea9648..04f15a52e88e 100755 --- a/.ci/docker/common/install_ucc.sh +++ b/.ci/docker/common/install_ucc.sh @@ -44,8 +44,12 @@ function install_ucc() { ./autogen.sh - # We only run distributed tests on Tesla M60 and A10G - NVCC_GENCODE="-gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_86,code=compute_86" + if [[ -n "$CUDA_VERSION" && 
$CUDA_VERSION == 13* ]]; then + NVCC_GENCODE="-gencode=arch=compute_86,code=compute_86" + else + # We only run distributed tests on Tesla M60 and A10G + NVCC_GENCODE="-gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_86,code=compute_86" + fi if [[ -n "$ROCM_VERSION" ]]; then if [[ -n "$PYTORCH_ROCM_ARCH" ]]; then diff --git a/.ci/docker/common/install_xpu.sh b/.ci/docker/common/install_xpu.sh index 7f21d2e42c72..0b150872f93c 100644 --- a/.ci/docker/common/install_xpu.sh +++ b/.ci/docker/common/install_xpu.sh @@ -65,10 +65,14 @@ function install_ubuntu() { function install_rhel() { . /etc/os-release - - if [[ ! " 8.8 8.10 9.0 9.2 9.3 " =~ " ${VERSION_ID} " ]]; then - echo "RHEL version ${VERSION_ID} not supported" - exit + if [[ "${ID}" == "rhel" ]]; then + if [[ ! " 8.8 8.9 9.0 9.2 9.3 " =~ " ${VERSION_ID} " ]]; then + echo "RHEL version ${VERSION_ID} not supported" + exit + fi + elif [[ "${ID}" == "almalinux" ]]; then + # Workaround for almalinux8 which used by quay.io/pypa/manylinux_2_28_x86_64 + VERSION_ID="8.8" fi dnf install -y 'dnf-command(config-manager)' @@ -146,11 +150,11 @@ if [[ "${XPU_DRIVER_TYPE,,}" == "lts" ]]; then XPU_DRIVER_VERSION="/lts/2350" fi -# Default use Intel® oneAPI Deep Learning Essentials 2025.0 -if [[ "$XPU_VERSION" == "2025.1" ]]; then - XPU_PACKAGES="intel-deep-learning-essentials-2025.1" +# Default use Intel® oneAPI Deep Learning Essentials 2025.1 +if [[ "$XPU_VERSION" == "2025.2" ]]; then + XPU_PACKAGES="intel-deep-learning-essentials-2025.2" else - XPU_PACKAGES="intel-deep-learning-essentials-2025.0" + XPU_PACKAGES="intel-deep-learning-essentials-2025.1" fi # The installation depends on the base OS diff --git a/.ci/docker/common/patch_libstdc.sh b/.ci/docker/common/patch_libstdc.sh new file mode 100755 index 000000000000..7e3a00d0dad8 --- /dev/null +++ b/.ci/docker/common/patch_libstdc.sh @@ -0,0 +1,9 @@ +#!/bin/bash +set -xe +# Script used in Linux x86 and aarch64 CD pipeline + +# Workaround for exposing statically linked libstdc++ CXX11 ABI symbols. 
+# see: https://github.com/pytorch/pytorch/issues/133437 +LIBNONSHARED=$(gcc -print-file-name=libstdc++_nonshared.a) +nm -g $LIBNONSHARED | grep " T " | grep recursive_directory_iterator | cut -c 20- > weaken-symbols.txt +objcopy --weaken-symbols weaken-symbols.txt $LIBNONSHARED $LIBNONSHARED diff --git a/.ci/docker/libtorch/Dockerfile b/.ci/docker/libtorch/Dockerfile index d2788b2713f7..d19431ad8b54 100644 --- a/.ci/docker/libtorch/Dockerfile +++ b/.ci/docker/libtorch/Dockerfile @@ -74,6 +74,14 @@ RUN bash ./install_cuda.sh 13.0 RUN bash ./install_magma.sh 13.0 RUN ln -sf /usr/local/cuda-13.0 /usr/local/cuda +# Install libibverbs for libtorch and copy to CUDA directory +RUN apt-get update -y && \ + apt-get install -y libibverbs-dev librdmacm-dev && \ + cp /usr/lib/x86_64-linux-gnu/libmlx5.so* /usr/local/cuda/lib64/ && \ + cp /usr/lib/x86_64-linux-gnu/librdmacm.so* /usr/local/cuda/lib64/ && \ + cp /usr/lib/x86_64-linux-gnu/libibverbs.so* /usr/local/cuda/lib64/ && \ + cp /usr/lib/x86_64-linux-gnu/libnl* /usr/local/cuda/lib64/ + FROM cpu as rocm ARG ROCM_VERSION ARG PYTORCH_ROCM_ARCH diff --git a/.ci/docker/manywheel/Dockerfile_2_28 b/.ci/docker/manywheel/Dockerfile_2_28 index b150423e9954..4803cb778c90 100644 --- a/.ci/docker/manywheel/Dockerfile_2_28 +++ b/.ci/docker/manywheel/Dockerfile_2_28 @@ -130,7 +130,8 @@ ENV LD_LIBRARY_PATH=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/lib64:/op RUN for cpython_version in "cp312-cp312" "cp313-cp313" "cp313-cp313t"; do \ /opt/python/${cpython_version}/bin/python -m pip install setuptools wheel; \ done; - +ADD ./common/patch_libstdc.sh patch_libstdc.sh +RUN bash ./patch_libstdc.sh && rm patch_libstdc.sh # cmake-3.18.4 from pip; force in case cmake3 already exists RUN yum install -y python3-pip && \ @@ -175,6 +176,6 @@ ENV XPU_DRIVER_TYPE ROLLING RUN python3 -m pip install --upgrade pip && \ python3 -mpip install cmake==3.28.4 ADD ./common/install_xpu.sh install_xpu.sh -ENV XPU_VERSION 2025.1 +ENV XPU_VERSION 2025.2 RUN bash ./install_xpu.sh && rm install_xpu.sh RUN pushd /opt/_internal && tar -xJf static-libs-for-embedding-only.tar.xz && popd diff --git a/.ci/docker/manywheel/Dockerfile_2_28_aarch64 b/.ci/docker/manywheel/Dockerfile_2_28_aarch64 index da7ab4d3fd15..6cfab77941fc 100644 --- a/.ci/docker/manywheel/Dockerfile_2_28_aarch64 +++ b/.ci/docker/manywheel/Dockerfile_2_28_aarch64 @@ -71,3 +71,5 @@ RUN rm -rf /opt/python/cp33-cp33m /opt/_internal/cpython-3.3.6 RUN rm -rf /opt/python/cp34-cp34m /opt/_internal/cpython-3.4.6 COPY --from=openblas /opt/OpenBLAS/ /opt/OpenBLAS/ ENV LD_LIBRARY_PATH=/opt/OpenBLAS/lib:$LD_LIBRARY_PATH +ADD ./common/patch_libstdc.sh patch_libstdc.sh +RUN bash ./patch_libstdc.sh && rm patch_libstdc.sh diff --git a/.ci/docker/manywheel/Dockerfile_cuda_aarch64 b/.ci/docker/manywheel/Dockerfile_cuda_aarch64 index 369706055737..4d2596fea821 100644 --- a/.ci/docker/manywheel/Dockerfile_cuda_aarch64 +++ b/.ci/docker/manywheel/Dockerfile_cuda_aarch64 @@ -95,3 +95,5 @@ COPY --from=nvpl /opt/nvpl/lib/ /usr/local/lib/ COPY --from=nvpl /opt/nvpl/include/ /usr/local/include/ RUN ln -sf /usr/local/cuda-${BASE_CUDA_VERSION} /usr/local/cuda ENV PATH=/usr/local/cuda/bin:$PATH +ADD ./common/patch_libstdc.sh patch_libstdc.sh +RUN bash ./patch_libstdc.sh && rm patch_libstdc.sh diff --git a/.ci/docker/requirements-ci.txt b/.ci/docker/requirements-ci.txt index c9d2fddb1324..248ee8409036 100644 --- a/.ci/docker/requirements-ci.txt +++ b/.ci/docker/requirements-ci.txt @@ -93,8 +93,9 @@ librosa==0.10.2 ; python_version == "3.12" and 
platform_machine != "s390x" #Pinned versions: #test that import: -mypy==1.16.0 +mypy==1.16.0 ; platform_system != "Windows" # Pin MyPy version because new errors are likely to appear with each release +# Skip on Windows as lots of type annotations are POSIX specific #Description: linter #Pinned versions: 1.16.0 #test that import: test_typing.py, test_type_hints.py @@ -112,9 +113,8 @@ ninja==1.11.1.3 #test that import: run_test.py, test_cpp_extensions_aot.py,test_determination.py numba==0.49.0 ; python_version < "3.9" and platform_machine != "s390x" -numba==0.55.2 ; python_version == "3.9" and platform_machine != "s390x" -numba==0.55.2 ; python_version == "3.10" and platform_machine != "s390x" -numba==0.60.0 ; python_version == "3.12" and platform_machine != "s390x" +numba==0.60.0 ; python_version == "3.9" and platform_machine != "s390x" +numba==0.61.2 ; python_version > "3.9" and platform_machine != "s390x" #Description: Just-In-Time Compiler for Numerical Functions #Pinned versions: 0.54.1, 0.49.0, <=0.49.1 #test that import: test_numba_integration.py @@ -133,12 +133,10 @@ numba==0.60.0 ; python_version == "3.12" and platform_machine != "s390x" #test_nn.py, test_namedtensor.py, test_linalg.py, test_jit_cuda_fuser.py, #test_jit.py, test_indexing.py, test_datapipe.py, test_dataloader.py, #test_binary_ufuncs.py -numpy==1.22.4; python_version == "3.9" or python_version == "3.10" -numpy==1.26.2; python_version == "3.11" or python_version == "3.12" -numpy==2.1.2; python_version >= "3.13" +numpy==2.0.2 ; python_version == "3.9" +numpy==2.1.2 ; python_version > "3.9" -pandas==2.0.3; python_version < "3.13" -pandas==2.2.3; python_version >= "3.13" +pandas==2.2.3 #onnxruntime #Description: scoring engine for Open Neural Network Exchange (ONNX) models @@ -168,10 +166,11 @@ pillow==11.0.0 #Pinned versions: 10.3.0 #test that import: -protobuf==5.29.4 -#Description: Google's data interchange format -#Pinned versions: 5.29.4 -#test that import: test_tensorboard.py, test/onnx/* +protobuf==3.20.2 ; python_version <= "3.12" +protobuf==4.25.1 ; python_version == "3.13" +#Description: Google’s data interchange format +#Pinned versions: 3.20.1 +#test that import: test_tensorboard.py psutil #Description: information on running processes and system utilization @@ -249,8 +248,8 @@ scikit-image==0.22.0 ; python_version >= "3.10" #Pinned versions: 0.20.3 #test that import: -scipy==1.10.1 ; python_version <= "3.11" -scipy==1.14.1 ; python_version >= "3.12" +scipy==1.13.1 ; python_version == "3.9" +scipy==1.14.1 ; python_version > "3.9" # Pin SciPy because of failing distribution tests (see #60347) #Description: scientific python #Pinned versions: 1.10.1 @@ -309,8 +308,7 @@ z3-solver==4.15.1.0 ; platform_machine != "s390x" #Pinned versions: #test that import: -tensorboard==2.13.0 ; python_version < "3.13" -tensorboard==2.18.0 ; python_version >= "3.13" +tensorboard==2.18.0 #Description: Also included in .ci/docker/requirements-docs.txt #Pinned versions: #test that import: test_tensorboard @@ -322,7 +320,8 @@ pywavelets==1.7.0 ; python_version >= "3.12" #Pinned versions: 1.4.1 #test that import: -lxml==5.3.0 +lxml==5.3.0 ; python_version <= "3.12" +lxml==6.0.0 ; python_version == "3.13" #Description: This is a requirement of unittest-xml-reporting # Python-3.9 binaries @@ -334,8 +333,9 @@ sympy==1.13.3 #Pinned versions: #test that import: -onnx==1.18.0 -#Description: Required by onnx tests, and mypy and test_public_bindings.py when checking torch.onnx._internal +onnx==1.16.1 ; python_version <= "3.12" +onnx==1.18.0 ; 
python_version == "3.13" +#Description: Required by mypy and test_public_bindings.py when checking torch.onnx._internal #Pinned versions: #test that import: @@ -379,7 +379,7 @@ dataclasses_json==0.6.7 cmake==4.0.0 #Description: required for building -tlparse==0.3.30 +tlparse==0.4.0 #Description: required for log parsing cuda-bindings>=12.0,<13.0 ; platform_machine != "s390x" diff --git a/.ci/docker/requirements-docs.txt b/.ci/docker/requirements-docs.txt index 3de4d8e0e44e..c5ad8e969fb9 100644 --- a/.ci/docker/requirements-docs.txt +++ b/.ci/docker/requirements-docs.txt @@ -1,7 +1,7 @@ sphinx==5.3.0 #Description: This is used to generate PyTorch docs #Pinned versions: 5.3.0 --e git+https://github.com/pytorch/pytorch_sphinx_theme.git@722b7e6f9ca512fcc526ad07d62b3d28c50bb6cd#egg=pytorch_sphinx_theme2 +-e git+https://github.com/pytorch/pytorch_sphinx_theme.git@71e55749be14ceb56e7f8211a9fb649866b87ad4#egg=pytorch_sphinx_theme2 # TODO: sphinxcontrib.katex 0.9.0 adds a local KaTeX server to speed up pre-rendering # but it doesn't seem to work and hangs around idly. The initial thought that it is probably diff --git a/.ci/docker/triton_version.txt b/.ci/docker/triton_version.txt index 18091983f59d..1545d966571d 100644 --- a/.ci/docker/triton_version.txt +++ b/.ci/docker/triton_version.txt @@ -1 +1 @@ -3.4.0 +3.5.0 diff --git a/.ci/docker/triton_xpu_version.txt b/.ci/docker/triton_xpu_version.txt index 18091983f59d..1545d966571d 100644 --- a/.ci/docker/triton_xpu_version.txt +++ b/.ci/docker/triton_xpu_version.txt @@ -1 +1 @@ -3.4.0 +3.5.0 diff --git a/.ci/docker/ubuntu/Dockerfile b/.ci/docker/ubuntu/Dockerfile index 57f997f30089..1edc8c60c2f0 100644 --- a/.ci/docker/ubuntu/Dockerfile +++ b/.ci/docker/ubuntu/Dockerfile @@ -66,6 +66,7 @@ ENV NCCL_LIB_DIR="/usr/local/cuda/lib64/" # (optional) Install UCC ARG UCX_COMMIT ARG UCC_COMMIT +ARG CUDA_VERSION ENV UCX_COMMIT $UCX_COMMIT ENV UCC_COMMIT $UCC_COMMIT ENV UCX_HOME /usr diff --git a/.ci/libtorch/build.sh b/.ci/libtorch/build.sh index 54ddd905aad0..c2d67f8b1bb2 100644 --- a/.ci/libtorch/build.sh +++ b/.ci/libtorch/build.sh @@ -7,4 +7,4 @@ set -ex SCRIPTPATH="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" -USE_NVSHMEM=0 USE_CUSPARSELT=0 BUILD_PYTHONLESS=1 DESIRED_PYTHON="3.9" ${SCRIPTPATH}/../manywheel/build.sh +USE_NVSHMEM=0 USE_CUSPARSELT=0 BUILD_PYTHONLESS=1 DESIRED_PYTHON="3.10" ${SCRIPTPATH}/../manywheel/build.sh diff --git a/.ci/lumen_cli/cli/lib/common/gh_summary.py b/.ci/lumen_cli/cli/lib/common/gh_summary.py new file mode 100644 index 000000000000..72bfaa76e706 --- /dev/null +++ b/.ci/lumen_cli/cli/lib/common/gh_summary.py @@ -0,0 +1,143 @@ +from __future__ import annotations + +import logging +import os +import textwrap +from pathlib import Path +from typing import TYPE_CHECKING + +from cli.lib.common.utils import get_wheels +from jinja2 import Template + + +if TYPE_CHECKING: + from collections.abc import Iterable, Mapping + + +logger = logging.getLogger(__name__) + +_TPL_CONTENT = Template( + textwrap.dedent("""\ + ## {{ title }} + + ```{{ lang }} + {{ content }} + ``` +""") +) + +_TPL_LIST_ITEMS = Template( + textwrap.dedent("""\ + ## {{ title }} + {% for it in items %} + - {{ it.pkg }}: {{ it.relpath }} + {% else %} + _(no item found)_ + {% endfor %} + """) +) + +_TPL_TABLE = Template( + textwrap.dedent("""\ + {%- if rows %} + | {{ cols | join(' | ') }} | + |{%- for _ in cols %} --- |{%- endfor %} + {%- for r in rows %} + | {%- for c in cols %} {{ r.get(c, "") }} |{%- endfor %} + {%- endfor %} + {%- else %} + _(no 
data)_ + {%- endif %} +""") +) + + +def gh_summary_path() -> Path | None: + """Return the Path to the GitHub step summary file, or None if not set.""" + p = os.environ.get("GITHUB_STEP_SUMMARY") + return Path(p) if p else None + + +def write_gh_step_summary(md: str, *, append_content: bool = True) -> bool: + """ + Write Markdown content to the GitHub Step Summary file if GITHUB_STEP_SUMMARY is set. + append_content: default true, if True, append to the end of the file, else overwrite the whole file + + Returns: + True if written successfully (in GitHub Actions environment), + False if skipped (e.g., running locally where the variable is not set). + """ + sp = gh_summary_path() + if not sp: + logger.info("[gh-summary] GITHUB_STEP_SUMMARY not set, skipping write.") + return False + + md_clean = textwrap.dedent(md).strip() + "\n" + + mode = "a" if append_content else "w" + with sp.open(mode, encoding="utf-8") as f: + f.write(md_clean) + return True + + +def md_heading(text: str, level: int = 2) -> str: + """Generate a Markdown heading string with the given level (1-6).""" + return f"{'#' * max(1, min(level, 6))} {text}\n" + + +def md_details(summary: str, content: str) -> str: + """Generate a collapsible
<details> block with a summary and inner content.""" + return f"<details>
\n<summary>{summary}</summary>\n\n{content}\n\n</details>
\n" + + +def summarize_content_from_file( + output_dir: Path, + freeze_file: str, + title: str = "Content from file", + code_lang: str = "", # e.g. "text" or "ini" +) -> bool: + f = Path(output_dir) / freeze_file + if not f.exists(): + return False + content = f.read_text(encoding="utf-8").strip() + md = render_content(content, title=title, lang=code_lang) + return write_gh_step_summary(md) + + +def summarize_wheels(path: Path, title: str = "Wheels", max_depth: int = 3): + items = get_wheels(path, max_depth=max_depth) + if not items: + return False + md = render_list(items, title=title) + return write_gh_step_summary(md) + + +def md_kv_table(rows: Iterable[Mapping[str, str | int | float]]) -> str: + """ + Render a list of dicts as a Markdown table using Jinja template. + """ + rows = list(rows) + cols = list({k for r in rows for k in r.keys()}) + md = _TPL_TABLE.render(cols=cols, rows=rows).strip() + "\n" + return md + + +def render_list( + items: Iterable[str], + *, + title: str = "List", +) -> str: + tpl = _TPL_LIST_ITEMS + md = tpl.render(title=title, items=items) + return md + + +def render_content( + content: str, + *, + title: str = "Content", + lang: str = "text", +) -> str: + tpl = _TPL_CONTENT + md = tpl.render(title=title, content=content, lang=lang) + return md diff --git a/.ci/lumen_cli/cli/lib/common/git_helper.py b/.ci/lumen_cli/cli/lib/common/git_helper.py index 7fa070a3cb65..9833caca956c 100644 --- a/.ci/lumen_cli/cli/lib/common/git_helper.py +++ b/.ci/lumen_cli/cli/lib/common/git_helper.py @@ -45,7 +45,7 @@ def clone_external_repo(target: str, repo: str, dst: str = "", update_submodules # Checkout pinned commit commit = get_post_build_pinned_commit(target) - logger.info("Checking out pinned commit %s", commit) + logger.info("Checking out pinned %s commit %s", target, commit) r.git.checkout(commit) # Update submodules if requested @@ -55,7 +55,7 @@ def clone_external_repo(target: str, repo: str, dst: str = "", update_submodules sm.update(init=True, recursive=True, progress=PrintProgress()) logger.info("Successfully cloned %s", target) - return r + return r, commit except GitCommandError as e: logger.error("Git operation failed: %s", e) diff --git a/.ci/lumen_cli/cli/lib/common/pip_helper.py b/.ci/lumen_cli/cli/lib/common/pip_helper.py index 1eed8406c9f7..a53747e24d25 100644 --- a/.ci/lumen_cli/cli/lib/common/pip_helper.py +++ b/.ci/lumen_cli/cli/lib/common/pip_helper.py @@ -4,7 +4,7 @@ import shutil import sys from collections.abc import Iterable -from importlib.metadata import PackageNotFoundError, version +from importlib.metadata import PackageNotFoundError, version # noqa: UP035 from typing import Optional, Union from cli.lib.common.utils import run_command diff --git a/.ci/lumen_cli/cli/lib/common/utils.py b/.ci/lumen_cli/cli/lib/common/utils.py index 05790bd66acf..b03309810d98 100644 --- a/.ci/lumen_cli/cli/lib/common/utils.py +++ b/.ci/lumen_cli/cli/lib/common/utils.py @@ -8,6 +8,7 @@ import subprocess import sys from contextlib import contextmanager +from pathlib import Path from typing import Optional @@ -115,3 +116,24 @@ def working_directory(path: str): yield finally: os.chdir(prev_cwd) + + +def get_wheels( + output_dir: Path, + max_depth: Optional[int] = None, +) -> list[str]: + """Return a list of wheels found in the given output directory.""" + root = Path(output_dir) + if not root.exists(): + return [] + items = [] + for dirpath, _, filenames in os.walk(root): + depth = Path(dirpath).relative_to(root).parts + if max_depth is not None and len(depth) > 
max_depth: + continue + for fname in sorted(filenames): + if fname.endswith(".whl"): + pkg = fname.split("-")[0] + relpath = str((Path(dirpath) / fname).relative_to(root)) + items.append({"pkg": pkg, "relpath": relpath}) + return items diff --git a/.ci/lumen_cli/cli/lib/core/vllm/lib.py b/.ci/lumen_cli/cli/lib/core/vllm/lib.py index 7f3a930b2cc6..0e2132839adb 100644 --- a/.ci/lumen_cli/cli/lib/core/vllm/lib.py +++ b/.ci/lumen_cli/cli/lib/core/vllm/lib.py @@ -1,13 +1,27 @@ import logging +import os +import textwrap from typing import Any +from cli.lib.common.gh_summary import write_gh_step_summary from cli.lib.common.git_helper import clone_external_repo from cli.lib.common.pip_helper import pip_install_packages from cli.lib.common.utils import run_command, temp_environ, working_directory +from jinja2 import Template logger = logging.getLogger(__name__) +_TPL_VLLM_INFO = Template( + textwrap.dedent("""\ + ## Vllm against Pytorch CI Test Summary + **Vllm Commit**: [{{ vllm_commit }}](https://github.com/vllm-project/vllm/commit/{{ vllm_commit }}) + {%- if torch_sha %} + **Pytorch Commit**: [{{ torch_sha }}](https://github.com/pytorch/pytorch/commit/{{ torch_sha }}) + {%- endif %} +""") +) + def sample_vllm_test_library(): """ @@ -27,7 +41,6 @@ def sample_vllm_test_library(): "pytest -v -s basic_correctness/test_cumem.py", "pytest -v -s basic_correctness/test_basic_correctness.py", "pytest -v -s basic_correctness/test_cpu_offload.py", - "VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py", ], }, "vllm_basic_models_test": { @@ -54,16 +67,12 @@ def sample_vllm_test_library(): "-v", "-s", "entrypoints/llm", - "--ignore=entrypoints/llm/test_lazy_outlines.py", "--ignore=entrypoints/llm/test_generate.py", - "--ignore=entrypoints/llm/test_generate_multiple_loras.py", "--ignore=entrypoints/llm/test_collective_rpc.py", ] ), - "pytest -v -s entrypoints/llm/test_lazy_outlines.py", - "pytest -v -s entrypoints/llm/test_generate.py ", - "pytest -v -s entrypoints/llm/test_generate_multiple_loras.py", - "VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode", + "pytest -v -s entrypoints/llm/test_generate.py", + "pytest -v -s entrypoints/offline_mode", ], }, "vllm_regression_test": { @@ -83,14 +92,24 @@ def sample_vllm_test_library(): "num_gpus": 4, "steps": [ "pytest -v -s -x lora/test_chatglm3_tp.py", - "echo $VLLM_WORKER_MULTIPROC_METHOD", "pytest -v -s -x lora/test_llama_tp.py", - "pytest -v -s -x lora/test_multi_loras_with_tp.py", + "pytest -v -s -x lora/test_llm_with_multi_loras.py", ], }, - "vllm_lora_280_failure_test": { - "title": "LoRA 280 failure test", - "id": "vllm_lora_280_failure_test", + "vllm_distributed_test_28_failure_test": { + "title": "Distributed Tests (2 GPUs) pytorch 2.8 release failure", + "id": "vllm_distributed_test_28_failure_test", + "env_vars": { + "VLLM_WORKER_MULTIPROC_METHOD": "spawn", + }, + "num_gpus": 4, + "steps": [ + "pytest -v -s distributed/test_sequence_parallel.py", + ], + }, + "vllm_lora_28_failure_test": { + "title": "LoRA pytorch 2.8 failure test", + "id": "vllm_lora_28_failure_test", "steps": ["pytest -v lora/test_quant_model.py"], }, "vllm_multi_model_processor_test": { @@ -101,6 +120,15 @@ def sample_vllm_test_library(): "pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py", ], }, + "vllm_multi_model_test_28_failure_test": { + "title": "Multi-Model Test (Failed 2.8 release)", + "id": "vllm_multi_model_test_28_failure_test", + "package_install": 
["git+https://github.com/TIGER-AI-Lab/Mantis.git"], + "steps": [ + "pytest -v -s models/multimodal/generation/test_voxtral.py", + "pytest -v -s models/multimodal/pooling", + ], + }, "vllm_pytorch_compilation_unit_tests": { "title": "PyTorch Compilation Unit Tests", "id": "vllm_pytorch_compilation_unit_tests", @@ -115,6 +143,28 @@ def sample_vllm_test_library(): "pytest -v -s compile/test_decorator.py", ], }, + "vllm_languagde_model_test_extended_generation_28_failure_test": { + "title": "Language Models Test (Extended Generation) 2.8 release failure", + "id": "vllm_languagde_model_test_extended_generation_28_failure_test", + "package_install": [ + "--no-build-isolation", + "git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8", + ], + "steps": [ + "pytest -v -s models/language/generation/test_mistral.py", + ], + }, + "vllm_distributed_test_2_gpu_28_failure_test": { + "title": "Distributed Tests (2 GPUs) pytorch 2.8 release failure", + "id": "vllm_distributed_test_2_gpu_28_failure_test", + "env_vars": { + "VLLM_WORKER_MULTIPROC_METHOD": "spawn", + }, + "num_gpus": 4, + "steps": [ + "pytest -v -s distributed/test_sequence_parallel.py", + ], + }, # TODO(elainewy):need to add g6 with 4 gpus to run this test "vllm_lora_test": { "title": "LoRA Test %N", @@ -214,12 +264,13 @@ def run_test_plan( def clone_vllm(dst: str = "vllm"): - clone_external_repo( + _, commit = clone_external_repo( target="vllm", repo="https://github.com/vllm-project/vllm.git", dst=dst, update_submodules=True, ) + return commit def replace_buildkite_placeholders(step: str, shard_id: int, num_shards: int) -> str: @@ -230,3 +281,12 @@ def replace_buildkite_placeholders(step: str, shard_id: int, num_shards: int) -> for k in sorted(mapping, key=len, reverse=True): step = step.replace(k, mapping[k]) return step + + +def summarize_build_info(vllm_commit: str) -> bool: + torch_sha = os.getenv("GITHUB_SHA") + md = ( + _TPL_VLLM_INFO.render(vllm_commit=vllm_commit, torch_sha=torch_sha).strip() + + "\n" + ) + return write_gh_step_summary(md) diff --git a/.ci/lumen_cli/cli/lib/core/vllm/vllm_build.py b/.ci/lumen_cli/cli/lib/core/vllm/vllm_build.py index d067a14f7590..8db48065cb05 100644 --- a/.ci/lumen_cli/cli/lib/core/vllm/vllm_build.py +++ b/.ci/lumen_cli/cli/lib/core/vllm/vllm_build.py @@ -13,6 +13,11 @@ env_str_field, with_params_help, ) +from cli.lib.common.gh_summary import ( + gh_summary_path, + summarize_content_from_file, + summarize_wheels, +) from cli.lib.common.path_helper import ( copy, ensure_dir_exists, @@ -21,7 +26,7 @@ is_path_exist, ) from cli.lib.common.utils import run_command -from cli.lib.core.vllm.lib import clone_vllm +from cli.lib.core.vllm.lib import clone_vllm, summarize_build_info logger = logging.getLogger(__name__) @@ -153,18 +158,43 @@ def run(self): """ inputs = VllmBuildParameters() logger.info("Running vllm build with inputs: %s", inputs) - clone_vllm() + vllm_commit = clone_vllm() self.cp_dockerfile_if_exist(inputs) - # cp torch wheels from root direct to vllm workspace if exist self.cp_torch_whls_if_exist(inputs) - ensure_dir_exists(inputs.output_dir) + # make sure the output dir to store the build artifacts exist + ensure_dir_exists(Path(inputs.output_dir)) cmd = self._generate_docker_build_cmd(inputs) logger.info("Running docker build: \n %s", cmd) - run_command(cmd, cwd="vllm", env=os.environ.copy()) + + try: + run_command(cmd, cwd="vllm", env=os.environ.copy()) + finally: + self.genearte_vllm_build_summary(vllm_commit, inputs) + + def genearte_vllm_build_summary( + self, vllm_commit: str, 
inputs: VllmBuildParameters + ): + if not gh_summary_path(): + return logger.info("Skipping, not detect GH Summary env var....") + logger.info("Generate GH Summary ...") + # summarize vllm build info + summarize_build_info(vllm_commit) + + # summarize vllm build artifacts + vllm_artifact_dir = inputs.output_dir / "wheels" + summarize_content_from_file( + vllm_artifact_dir, + "build_summary.txt", + title="Vllm build env pip package summary", + ) + summarize_wheels( + inputs.torch_whls_path, max_depth=3, title="Torch Wheels Artifacts" + ) + summarize_wheels(vllm_artifact_dir, max_depth=3, title="Vllm Wheels Artifacts") def cp_torch_whls_if_exist(self, inputs: VllmBuildParameters) -> str: if not inputs.use_torch_whl: diff --git a/.ci/lumen_cli/cli/lib/core/vllm/vllm_test.py b/.ci/lumen_cli/cli/lib/core/vllm/vllm_test.py index 2be8e246486e..76401e33f29f 100644 --- a/.ci/lumen_cli/cli/lib/core/vllm/vllm_test.py +++ b/.ci/lumen_cli/cli/lib/core/vllm/vllm_test.py @@ -104,20 +104,26 @@ def run(self): main function to run vllm test """ self.prepare() - with working_directory(self.work_directory): - if self.test_type == TestInpuType.TEST_PLAN: - if self.num_shards > 1: - run_test_plan( - self.test_plan, - "vllm", - sample_vllm_test_library(), - self.shard_id, - self.num_shards, - ) + try: + with working_directory(self.work_directory): + if self.test_type == TestInpuType.TEST_PLAN: + if self.num_shards > 1: + run_test_plan( + self.test_plan, + "vllm", + sample_vllm_test_library(), + self.shard_id, + self.num_shards, + ) + else: + run_test_plan( + self.test_plan, "vllm", sample_vllm_test_library() + ) else: - run_test_plan(self.test_plan, "vllm", sample_vllm_test_library()) - else: - raise ValueError(f"Unknown test type {self.test_type}") + raise ValueError(f"Unknown test type {self.test_type}") + finally: + # double check the torches are not overridden by other packages + check_versions() def _install_wheels(self, params: VllmTestParameters): logger.info("Running vllm test with inputs: %s", params) @@ -220,6 +226,8 @@ def preprocess_test_in( target_path = Path(target_file) lines = target_path.read_text().splitlines() + pkgs_to_add = [] + # Remove lines starting with the package names (==, @, >=) — case-insensitive pattern = re.compile(rf"^({'|'.join(pkgs_to_remove)})\s*(==|@|>=)", re.IGNORECASE) kept_lines = [line for line in lines if not pattern.match(line)] @@ -236,7 +244,11 @@ def preprocess_test_in( ] # Write back: header_lines + blank + kept_lines - out = "\n".join(header_lines + [""] + kept_lines) + "\n" + out_lines = header_lines + [""] + kept_lines + if pkgs_to_add: + out_lines += [""] + pkgs_to_add + + out = "\n".join(out_lines) + "\n" target_path.write_text(out) logger.info("[INFO] Updated %s", target_file) diff --git a/.ci/manywheel/build_cuda.sh b/.ci/manywheel/build_cuda.sh index 3fbd25be1da3..6ed38f8b25c6 100644 --- a/.ci/manywheel/build_cuda.sh +++ b/.ci/manywheel/build_cuda.sh @@ -124,6 +124,7 @@ if [[ $CUDA_VERSION == 12* || $CUDA_VERSION == 13* ]]; then fi if [[ -z "$PYTORCH_EXTRA_INSTALL_REQUIREMENTS" ]]; then echo "Bundling with cudnn and cublas." 
+ DEPS_LIST+=( "/usr/local/cuda/lib64/libcudnn_adv.so.9" "/usr/local/cuda/lib64/libcudnn_cnn.so.9" @@ -133,16 +134,11 @@ if [[ $CUDA_VERSION == 12* || $CUDA_VERSION == 13* ]]; then "/usr/local/cuda/lib64/libcudnn_engines_precompiled.so.9" "/usr/local/cuda/lib64/libcudnn_heuristic.so.9" "/usr/local/cuda/lib64/libcudnn.so.9" - "/usr/local/cuda/lib64/libcublas.so.12" - "/usr/local/cuda/lib64/libcublasLt.so.12" "/usr/local/cuda/lib64/libcusparseLt.so.0" - "/usr/local/cuda/lib64/libcudart.so.12" - "/usr/local/cuda/lib64/libnvrtc.so.12" "/usr/local/cuda/lib64/libnvrtc-builtins.so" "/usr/local/cuda/lib64/libcufile.so.0" "/usr/local/cuda/lib64/libcufile_rdma.so.1" "/usr/local/cuda/lib64/libnvshmem_host.so.3" - "/usr/local/cuda/extras/CUPTI/lib64/libcupti.so.12" "/usr/local/cuda/extras/CUPTI/lib64/libnvperf_host.so" ) DEPS_SONAME+=( @@ -154,22 +150,56 @@ if [[ $CUDA_VERSION == 12* || $CUDA_VERSION == 13* ]]; then "libcudnn_engines_precompiled.so.9" "libcudnn_heuristic.so.9" "libcudnn.so.9" - "libcublas.so.12" - "libcublasLt.so.12" "libcusparseLt.so.0" - "libcudart.so.12" - "libnvrtc.so.12" "libnvrtc-builtins.so" "libnvshmem_host.so.3" "libcufile.so.0" "libcufile_rdma.so.1" - "libcupti.so.12" "libnvperf_host.so" ) # Add libnvToolsExt only if CUDA version is not 12.9 - if [[ $CUDA_VERSION != 12.9* ]]; then - DEPS_LIST+=("/usr/local/cuda/lib64/libnvToolsExt.so.1") - DEPS_SONAME+=("libnvToolsExt.so.1") + if [[ $CUDA_VERSION == 13* ]]; then + DEPS_LIST+=( + "/usr/local/cuda/lib64/libcublas.so.13" + "/usr/local/cuda/lib64/libcublasLt.so.13" + "/usr/local/cuda/lib64/libcudart.so.13" + "/usr/local/cuda/lib64/libnvrtc.so.13" + "/usr/local/cuda/extras/CUPTI/lib64/libcupti.so.13" + "/usr/local/cuda/lib64/libibverbs.so.1" + "/usr/local/cuda/lib64/librdmacm.so.1" + "/usr/local/cuda/lib64/libmlx5.so.1" + "/usr/local/cuda/lib64/libnl-3.so.200" + "/usr/local/cuda/lib64/libnl-route-3.so.200") + DEPS_SONAME+=( + "libcublas.so.13" + "libcublasLt.so.13" + "libcudart.so.13" + "libnvrtc.so.13" + "libcupti.so.13" + "libibverbs.so.1" + "librdmacm.so.1" + "libmlx5.so.1" + "libnl-3.so.200" + "libnl-route-3.so.200") + export USE_CUPTI_SO=1 + export ATEN_STATIC_CUDA=0 + export USE_CUDA_STATIC_LINK=0 + export USE_CUFILE=0 + else + DEPS_LIST+=( + "/usr/local/cuda/lib64/libnvToolsExt.so.1" + "/usr/local/cuda/lib64/libcublas.so.12" + "/usr/local/cuda/lib64/libcublasLt.so.12" + "/usr/local/cuda/lib64/libcudart.so.12" + "/usr/local/cuda/lib64/libnvrtc.so.12" + "/usr/local/cuda/extras/CUPTI/lib64/libcupti.so.12") + DEPS_SONAME+=( + "libnvToolsExt.so.1" + "libcublas.so.12" + "libcublasLt.so.12" + "libcudart.so.12" + "libnvrtc.so.12" + "libcupti.so.12") fi else echo "Using nvidia libs from pypi." diff --git a/.ci/pytorch/check_binary.sh b/.ci/pytorch/check_binary.sh index 0f632f8006c0..cca289ac146b 100755 --- a/.ci/pytorch/check_binary.sh +++ b/.ci/pytorch/check_binary.sh @@ -67,7 +67,7 @@ fi # wheels with cxx11-abi echo "Checking that the gcc ABI is what we expect" -if [[ "$(uname)" != 'Darwin' ]]; then +if [[ "$(uname)" != 'Darwin' && "$(uname -m)" != "s390x" ]]; then # We also check that there are cxx11 symbols in libtorch # echo "Checking that symbols in libtorch.so have the right gcc abi" diff --git a/.ci/pytorch/common_utils.sh b/.ci/pytorch/common_utils.sh index 6d79a4517edf..bf03e132d30b 100644 --- a/.ci/pytorch/common_utils.sh +++ b/.ci/pytorch/common_utils.sh @@ -284,7 +284,7 @@ function install_torchrec_and_fbgemm() { function clone_pytorch_xla() { if [[ ! 
-d ./xla ]]; then - git clone --recursive --quiet https://github.com/pytorch/xla.git + git clone --recursive -b r2.9 https://github.com/pytorch/xla.git pushd xla # pin the xla hash so that we don't get broken by changes to xla git checkout "$(cat ../.github/ci_commit_pins/xla.txt)" diff --git a/.ci/pytorch/cpp_doc_push_script.sh b/.ci/pytorch/cpp_doc_push_script.sh index 6e417bf8bbe9..f085fa78bebe 100755 --- a/.ci/pytorch/cpp_doc_push_script.sh +++ b/.ci/pytorch/cpp_doc_push_script.sh @@ -58,7 +58,7 @@ time python tools/setup_helpers/generate_code.py \ # Build the docs pushd docs/cpp -time make VERBOSE=1 html -j +time make VERBOSE=1 html popd popd diff --git a/.ci/pytorch/macos-test.sh b/.ci/pytorch/macos-test.sh index 295a82f057dc..a859901191e0 100755 --- a/.ci/pytorch/macos-test.sh +++ b/.ci/pytorch/macos-test.sh @@ -195,7 +195,7 @@ torchbench_setup_macos() { git checkout "$(cat ../.github/ci_commit_pins/vision.txt)" git submodule update --init --recursive python setup.py clean - python setup.py develop + python -m pip install -e . -v --no-build-isolation popd pushd torchaudio @@ -204,7 +204,7 @@ torchbench_setup_macos() { git submodule update --init --recursive python setup.py clean #TODO: Remove me, when figure out how to make TorchAudio find brew installed openmp - USE_OPENMP=0 python setup.py develop + USE_OPENMP=0 python -m pip install -e . -v --no-build-isolation popd checkout_install_torchbench @@ -302,6 +302,47 @@ test_torchbench_smoketest() { fi done + echo "Pytorch benchmark on mps device completed" +} + +test_aoti_torchbench_smoketest() { + print_cmake_info + + echo "Launching AOTInductor torchbench setup" + pip_benchmark_deps + # shellcheck disable=SC2119,SC2120 + torchbench_setup_macos + + TEST_REPORTS_DIR=$(pwd)/test/test-reports + mkdir -p "$TEST_REPORTS_DIR" + + local device=mps + local dtypes=(undefined float16 bfloat16 notset) + local dtype=${dtypes[$1]} + local models=(hf_T5 llama BERT_pytorch dcgan hf_GPT2 yolov3 resnet152 sam sam_fast pytorch_unet stable_diffusion_text_encoder speech_transformer Super_SloMo doctr_det_predictor doctr_reco_predictor timm_resnet timm_vovnet vgg16) + + echo "Launching torchbench inference performance run for AOT Inductor and dtype ${dtype}" + local dtype_arg="--${dtype}" + if [ "$dtype" == notset ]; then + dtype_arg="--float32" + fi + touch "$TEST_REPORTS_DIR/aot_inductor_torchbench_${dtype}_inference_${device}_performance.csv" + for model in "${models[@]}"; do + PYTHONPATH="$(pwd)"/torchbench python benchmarks/dynamo/torchbench.py \ + --performance --only "$model" --export-aot-inductor --inference --devices "$device" "$dtype_arg" \ + --output "$TEST_REPORTS_DIR/aot_inductor_torchbench_${dtype}_inference_${device}_performance.csv" || true + PYTHONPATH="$(pwd)"/torchbench python benchmarks/dynamo/torchbench.py \ + --accuracy --only "$model" --export-aot-inductor --inference --devices "$device" "$dtype_arg" \ + --output "$TEST_REPORTS_DIR/aot_inductor_torchbench_${dtype}_inference_${device}_accuracy.csv" || true + done + + echo "Launching HuggingFace inference performance run for AOT Inductor and dtype ${dtype}" + PYTHONPATH="$(pwd)"/torchbench python benchmarks/dynamo/huggingface.py \ + --performance --export-aot-inductor --inference --devices "$device" "$dtype_arg" \ + --output "$TEST_REPORTS_DIR/aot_inductor_huggingface_${dtype}_inference_${device}_performance.csv" || true + PYTHONPATH="$(pwd)"/torchbench python benchmarks/dynamo/huggingface.py \ + --accuracy --export-aot-inductor --inference --devices "$device" "$dtype_arg" \ + 
--output "$TEST_REPORTS_DIR/aot_inductor_huggingface_${dtype}_inference_${device}_accuracy.csv" || true echo "Pytorch benchmark on mps device completed" } @@ -350,6 +391,8 @@ elif [[ $TEST_CONFIG == *"perf_timm"* ]]; then test_timm_perf elif [[ $TEST_CONFIG == *"perf_smoketest"* ]]; then test_torchbench_smoketest "${SHARD_NUMBER}" +elif [[ $TEST_CONFIG == *"aot_inductor_perf_smoketest"* ]]; then + test_aoti_torchbench_smoketest "${SHARD_NUMBER}" elif [[ $TEST_CONFIG == *"mps"* ]]; then test_python_mps elif [[ $NUM_TEST_SHARDS -gt 1 ]]; then diff --git a/.ci/pytorch/numba-cuda-13.patch b/.ci/pytorch/numba-cuda-13.patch new file mode 100644 index 000000000000..f96ff287ed39 --- /dev/null +++ b/.ci/pytorch/numba-cuda-13.patch @@ -0,0 +1,25 @@ +From 6e08c9d08e9de59c7af28b720289debbbd384764 Mon Sep 17 00:00:00 2001 +From: Michael Wang <13521008+isVoid@users.noreply.github.com> +Date: Tue, 1 Apr 2025 17:28:05 -0700 +Subject: [PATCH] Avoid bumping certain driver API to avoid future breakage + (#185) + +Co-authored-by: isVoid +--- + numba_cuda/numba/cuda/cudadrv/driver.py | 3 +++ + 1 file changed, 3 insertions(+) + +diff --git a/numba_cuda/numba/cuda/cudadrv/driver.py b/numba_cuda/numba/cuda/cudadrv/driver.py +index 1641bf77..233e9ed7 100644 +--- a/numba_cuda/numba/cuda/cudadrv/driver.py ++++ b/numba_cuda/numba/cuda/cudadrv/driver.py +@@ -365,6 +365,9 @@ def _find_api(self, fname): + else: + variants = ('_v2', '') + ++ if fname in ("cuCtxGetDevice", "cuCtxSynchronize"): ++ return getattr(self.lib, fname) ++ + for variant in variants: + try: + return getattr(self.lib, f'{fname}{variant}') diff --git a/.ci/pytorch/smoke_test/check_binary_symbols.py b/.ci/pytorch/smoke_test/check_binary_symbols.py index 3e88ffe4ffd7..b0c607659c72 100755 --- a/.ci/pytorch/smoke_test/check_binary_symbols.py +++ b/.ci/pytorch/smoke_test/check_binary_symbols.py @@ -32,6 +32,9 @@ "torch::", ) +# Patterns for detecting statically linked libstdc++ symbols +STATICALLY_LINKED_CXX11_ABI = [re.compile(r".*recursive_directory_iterator.*")] + def _apply_libtorch_symbols(symbols): return [ @@ -53,12 +56,17 @@ def get_symbols(lib: str) -> list[tuple[str, str, str]]: return [x.split(" ", 2) for x in lines.decode("latin1").split("\n")[:-1]] -def grep_symbols(lib: str, patterns: list[Any]) -> list[str]: +def grep_symbols( + lib: str, patterns: list[Any], symbol_type: str | None = None +) -> list[str]: def _grep_symbols( symbols: list[tuple[str, str, str]], patterns: list[Any] ) -> list[str]: rc = [] for _s_addr, _s_type, s_name in symbols: + # Filter by symbol type if specified + if symbol_type and _s_type != symbol_type: + continue for pattern in patterns: if pattern.match(s_name): rc.append(s_name) @@ -80,6 +88,18 @@ def _get_symbols_chunk(i): return functools.reduce(list.__add__, (x.result() for x in tasks), []) +def check_lib_statically_linked_libstdc_cxx_abi_symbols(lib: str) -> None: + cxx11_statically_linked_symbols = grep_symbols( + lib, STATICALLY_LINKED_CXX11_ABI, symbol_type="T" + ) + num_statically_linked_symbols = len(cxx11_statically_linked_symbols) + print(f"num_statically_linked_symbols (T): {num_statically_linked_symbols}") + if num_statically_linked_symbols > 0: + raise RuntimeError( + f"Found statically linked libstdc++ symbols (recursive_directory_iterator): {cxx11_statically_linked_symbols[:100]}" + ) + + def check_lib_symbols_for_abi_correctness(lib: str) -> None: print(f"lib: {lib}") cxx11_symbols = grep_symbols(lib, LIBTORCH_CXX11_PATTERNS) @@ -107,6 +127,7 @@ def main() -> None: libtorch_cpu_path = 
str(install_root / "lib" / "libtorch_cpu.so") check_lib_symbols_for_abi_correctness(libtorch_cpu_path) + check_lib_statically_linked_libstdc_cxx_abi_symbols(libtorch_cpu_path) if __name__ == "__main__": diff --git a/.ci/pytorch/test.sh b/.ci/pytorch/test.sh index a0c3760b5eaa..e8c5b3fc56af 100755 --- a/.ci/pytorch/test.sh +++ b/.ci/pytorch/test.sh @@ -32,6 +32,16 @@ if [[ "$BUILD_ENVIRONMENT" != *rocm* && "$BUILD_ENVIRONMENT" != *s390x* && -d /v git config --global --add safe.directory /var/lib/jenkins/workspace fi + +# Patch numba to avoid CUDA-13 crash, see https://github.com/pytorch/pytorch/issues/162878 +NUMBA_CUDA_DIR=$(python -c "import os;import numba.cuda; print(os.path.dirname(numba.cuda.__file__))" 2>/dev/null || true) +if [ -n "$NUMBA_CUDA_DIR" ]; then + NUMBA_PATCH="$(dirname "$(realpath "${BASH_SOURCE[0]}")")/numba-cuda-13.patch" + pushd "$NUMBA_CUDA_DIR" + patch -p4 <"$NUMBA_PATCH" + popd +fi + echo "Environment variables:" env @@ -496,6 +506,14 @@ test_inductor_cpp_wrapper_shard() { -k 'take' \ --shard "$1" "$NUM_TEST_SHARDS" \ --verbose + + if [[ "${BUILD_ENVIRONMENT}" == *xpu* ]]; then + python test/run_test.py \ + --include inductor/test_mkldnn_pattern_matcher \ + -k 'xpu' \ + --shard "$1" "$NUM_TEST_SHARDS" \ + --verbose + fi } # "Global" flags for inductor benchmarking controlled by TEST_CONFIG @@ -1606,6 +1624,25 @@ test_operator_benchmark() { --expected "expected_ci_operator_benchmark_eager_float32_cpu.csv" } +test_operator_microbenchmark() { + TEST_REPORTS_DIR=$(pwd)/test/test-reports + mkdir -p "$TEST_REPORTS_DIR" + TEST_DIR=$(pwd) + + cd benchmarks/operator_benchmark/pt_extension + python -m pip install . + + cd "${TEST_DIR}"/benchmarks/operator_benchmark + + for OP_BENCHMARK_TESTS in matmul mm addmm bmm; do + $TASKSET python -m pt.${OP_BENCHMARK_TESTS}_test --tag-filter long \ + --output-json-for-dashboard "${TEST_REPORTS_DIR}/operator_microbenchmark_${OP_BENCHMARK_TESTS}_compile.json" \ + --benchmark-name "PyTorch operator microbenchmark" --use-compile + $TASKSET python -m pt.${OP_BENCHMARK_TESTS}_test --tag-filter long \ + --output-json-for-dashboard "${TEST_REPORTS_DIR}/operator_microbenchmark_${OP_BENCHMARK_TESTS}.json" \ + --benchmark-name "PyTorch operator microbenchmark" + done +} if ! 
[[ "${BUILD_ENVIRONMENT}" == *libtorch* || "${BUILD_ENVIRONMENT}" == *-bazel-* ]]; then (cd test && python -c "import torch; print(torch.__config__.show())") @@ -1660,6 +1697,8 @@ elif [[ "${TEST_CONFIG}" == *operator_benchmark* ]]; then test_operator_benchmark cpu ${TEST_MODE} fi +elif [[ "${TEST_CONFIG}" == *operator_microbenchmark* ]]; then + test_operator_microbenchmark elif [[ "${TEST_CONFIG}" == *inductor_distributed* ]]; then test_inductor_distributed elif [[ "${TEST_CONFIG}" == *inductor-halide* ]]; then @@ -1713,11 +1752,6 @@ elif [[ "${TEST_CONFIG}" == *inductor_cpp_wrapper* ]]; then elif [[ "${TEST_CONFIG}" == *inductor* ]]; then install_torchvision test_inductor_shard "${SHARD_NUMBER}" - if [[ "${SHARD_NUMBER}" == 1 ]]; then - if [[ "${BUILD_ENVIRONMENT}" != linux-jammy-py3.9-gcc11-build ]]; then - test_inductor_distributed - fi - fi elif [[ "${TEST_CONFIG}" == *einops* ]]; then test_einops elif [[ "${TEST_CONFIG}" == *dynamo_wrapped* ]]; then diff --git a/.ci/pytorch/win-test-helpers/build_pytorch.bat b/.ci/pytorch/win-test-helpers/build_pytorch.bat index 19d715b9d0b6..67d156922192 100644 --- a/.ci/pytorch/win-test-helpers/build_pytorch.bat +++ b/.ci/pytorch/win-test-helpers/build_pytorch.bat @@ -137,7 +137,7 @@ sccache --show-stats python -c "import os, glob; os.system('python -mpip install --no-index --no-deps ' + glob.glob('dist/*.whl')[0])" ( if "%BUILD_ENVIRONMENT%"=="" ( - echo NOTE: To run `import torch`, please make sure to activate the conda environment by running `call %CONDA_PARENT_DIR%\Miniconda3\Scripts\activate.bat %CONDA_PARENT_DIR%\Miniconda3` in Command Prompt before running Git Bash. + echo NOTE: To run `import torch`, please make sure to activate the conda environment by running `call %CONDA_ROOT_DIR%\Scripts\activate.bat %CONDA_ROOT_DIR%\envs\py_tmp` in Command Prompt before running Git Bash. ) else ( copy /Y "dist\*.whl" "%PYTORCH_FINAL_PACKAGE_DIR%" diff --git a/.ci/pytorch/win-test-helpers/installation-helpers/activate_miniconda3.bat b/.ci/pytorch/win-test-helpers/installation-helpers/activate_miniconda3.bat index 01e08c8bb4e5..abd2c8722b11 100644 --- a/.ci/pytorch/win-test-helpers/installation-helpers/activate_miniconda3.bat +++ b/.ci/pytorch/win-test-helpers/installation-helpers/activate_miniconda3.bat @@ -3,12 +3,12 @@ if "%BUILD_ENVIRONMENT%"=="" ( ) else ( set CONDA_PARENT_DIR=C:\Jenkins ) - +set CONDA_ROOT_DIR=%CONDA_PARENT_DIR%\Miniconda3 :: Be conservative here when rolling out the new AMI with conda. This will try :: to install conda as before if it couldn't find the conda installation. This :: can be removed eventually after we gain enough confidence in the AMI -if not exist %CONDA_PARENT_DIR%\Miniconda3 ( +if not exist %CONDA_ROOT_DIR% ( set INSTALL_FRESH_CONDA=1 ) @@ -17,10 +17,14 @@ if "%INSTALL_FRESH_CONDA%"=="1" ( if errorlevel 1 exit /b if not errorlevel 0 exit /b - %TMP_DIR_WIN%\Miniconda3-latest-Windows-x86_64.exe /InstallationType=JustMe /RegisterPython=0 /S /AddToPath=0 /D=%CONDA_PARENT_DIR%\Miniconda3 + %TMP_DIR_WIN%\Miniconda3-latest-Windows-x86_64.exe /InstallationType=JustMe /RegisterPython=0 /S /AddToPath=0 /D=%CONDA_ROOT_DIR% if errorlevel 1 exit /b if not errorlevel 0 exit /b ) :: Activate conda so that we can use its commands, i.e. conda, python, pip -call %CONDA_PARENT_DIR%\Miniconda3\Scripts\activate.bat %CONDA_PARENT_DIR%\Miniconda3 +call %CONDA_ROOT_DIR%\Scripts\activate.bat %CONDA_ROOT_DIR% +:: Activate conda so that we can use its commands, i.e. 
conda, python, pip +call conda activate py_tmp + +call pip install -r .ci/docker/requirements-ci.txt diff --git a/.ci/pytorch/win-test-helpers/setup_pytorch_env.bat b/.ci/pytorch/win-test-helpers/setup_pytorch_env.bat index 4a464d6b5786..3173582b06f4 100644 --- a/.ci/pytorch/win-test-helpers/setup_pytorch_env.bat +++ b/.ci/pytorch/win-test-helpers/setup_pytorch_env.bat @@ -14,7 +14,7 @@ if not errorlevel 0 exit /b :: build\torch. Rather than changing all these references, making a copy of torch folder :: from conda to the current workspace is easier. The workspace will be cleaned up after :: the job anyway -xcopy /s %CONDA_PARENT_DIR%\Miniconda3\Lib\site-packages\torch %TMP_DIR_WIN%\build\torch\ +xcopy /s %CONDA_ROOT_DIR%\envs\py_tmp\Lib\site-packages\torch %TMP_DIR_WIN%\build\torch\ pushd . if "%VC_VERSION%" == "" ( diff --git a/.ci/pytorch/win-test.sh b/.ci/pytorch/win-test.sh index be7f3e4bb35c..c96d5c331c9f 100755 --- a/.ci/pytorch/win-test.sh +++ b/.ci/pytorch/win-test.sh @@ -38,13 +38,20 @@ if [[ "$BUILD_ENVIRONMENT" == *cuda* ]]; then fi # TODO: Move both of them to Windows AMI -python -m pip install pytest-rerunfailures==10.3 pytest-cpp==2.3.0 tensorboard==2.13.0 protobuf==5.29.4 pytest-subtests==0.13.1 +python -m pip install tensorboard==2.13.0 protobuf==5.29.4 pytest-subtests==0.13.1 + +# Copied from https://github.com/pytorch/test-infra/blob/be01a40157c36cd5a48391fdf44a7bc3ebd4c7e3/aws/ami/windows/scripts/Installers/Install-Pip-Dependencies.ps1#L16 with some adjustments +# pytest-rerunfailures==10.3 as 10.2 fails with INTERNALERROR> pluggy._manager.PluginValidationError: unknown hook 'pytest_configure_node' +# scipy from 1.6.3 to 1.10 +# expecttest from 0.1.3 to 0.3.0 +# xdoctest from 1.0.2 to 1.3.0 +python -m pip install "future==0.18.2" "hypothesis==5.35.1" "expecttest==0.3.0" "librosa>=0.6.2" "scipy==1.10.1" "psutil==5.9.1" "pynvml==11.4.1" "pillow==9.2.0" "unittest-xml-reporting<=3.2.0,>=2.0.0" "pytest==7.1.3" "pytest-xdist==2.5.0" "pytest-flakefinder==1.1.0" "pytest-rerunfailures==10.3" "pytest-shard==0.1.2" "sympy==1.11.1" "xdoctest==1.3.0" "pygments==2.12.0" "opt-einsum>=3.3" "networkx==2.8.8" "mpmath==1.2.1" "pytest-cpp==2.3.0" "boto3==1.35.42" # Install Z3 optional dependency for Windows builds. python -m pip install z3-solver==4.15.1.0 # Install tlparse for test\dynamo\test_structured_trace.py UTs. 
-python -m pip install tlparse==0.3.30 +python -m pip install tlparse==0.4.0 # Install parameterized python -m pip install parameterized==0.8.1 @@ -52,9 +59,6 @@ python -m pip install parameterized==0.8.1 # Install pulp for testing ilps under torch\distributed\_tools python -m pip install pulp==2.9.0 -# Install expecttest to merge https://github.com/pytorch/pytorch/pull/155308 -python -m pip install expecttest==0.3.0 - run_tests() { # Run nvidia-smi if available for path in '/c/Program Files/NVIDIA Corporation/NVSMI/nvidia-smi.exe' /c/Windows/System32/nvidia-smi.exe; do diff --git a/.ci/pytorch/windows/cuda128.bat b/.ci/pytorch/windows/cuda128.bat index bbdfb4bd1bb7..bbd349e2efb4 100644 --- a/.ci/pytorch/windows/cuda128.bat +++ b/.ci/pytorch/windows/cuda128.bat @@ -37,10 +37,10 @@ IF "%CUDA_PATH_V128%"=="" ( ) IF "%BUILD_VISION%" == "" ( - set TORCH_CUDA_ARCH_LIST=6.1;7.0;7.5;8.0;8.6;9.0;10.0;12.0 + set TORCH_CUDA_ARCH_LIST=7.0;7.5;8.0;8.6;9.0;10.0;12.0 set TORCH_NVCC_FLAGS=-Xfatbin -compress-all ) ELSE ( - set NVCC_FLAGS=-D__CUDA_NO_HALF_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_86,code=compute_86 -gencode=arch=compute_90,code=compute_90 -gencode=arch=compute_100,code=compute_100 -gencode=arch=compute_120,code=compute_120 + set NVCC_FLAGS=-D__CUDA_NO_HALF_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_86,code=compute_86 -gencode=arch=compute_90,code=compute_90 -gencode=arch=compute_100,code=compute_100 -gencode=arch=compute_120,code=compute_120 ) set "CUDA_PATH=%CUDA_PATH_V128%" diff --git a/.ci/pytorch/windows/internal/copy.bat b/.ci/pytorch/windows/internal/copy.bat index 40f2bd7acdbb..e0281c0d78a4 100644 --- a/.ci/pytorch/windows/internal/copy.bat +++ b/.ci/pytorch/windows/internal/copy.bat @@ -1,12 +1,20 @@ -copy "%CUDA_PATH%\bin\cusparse*64_*.dll*" pytorch\torch\lib -copy "%CUDA_PATH%\bin\cublas*64_*.dll*" pytorch\torch\lib -copy "%CUDA_PATH%\bin\cudart*64_*.dll*" pytorch\torch\lib -copy "%CUDA_PATH%\bin\curand*64_*.dll*" pytorch\torch\lib -copy "%CUDA_PATH%\bin\cufft*64_*.dll*" pytorch\torch\lib -copy "%CUDA_PATH%\bin\cusolver*64_*.dll*" pytorch\torch\lib + +if %CUDA_VERSION% geq 130 ( + set "dll_path=bin\x64" +) else ( + set "dll_path=bin" +) + +copy "%CUDA_PATH%\%dll_path%\cusparse*64_*.dll*" pytorch\torch\lib +copy "%CUDA_PATH%\%dll_path%\cublas*64_*.dll*" pytorch\torch\lib +copy "%CUDA_PATH%\%dll_path%\cudart*64_*.dll*" pytorch\torch\lib +copy "%CUDA_PATH%\%dll_path%\curand*64_*.dll*" pytorch\torch\lib +copy "%CUDA_PATH%\%dll_path%\cufft*64_*.dll*" pytorch\torch\lib +copy "%CUDA_PATH%\%dll_path%\cusolver*64_*.dll*" pytorch\torch\lib +copy "%CUDA_PATH%\%dll_path%\nvrtc*64_*.dll*" pytorch\torch\lib +copy "%CUDA_PATH%\%dll_path%\nvJitLink_*.dll*" pytorch\torch\lib copy "%CUDA_PATH%\bin\cudnn*64_*.dll*" pytorch\torch\lib -copy "%CUDA_PATH%\bin\nvrtc*64_*.dll*" pytorch\torch\lib copy "%CUDA_PATH%\extras\CUPTI\lib64\cupti64_*.dll*" pytorch\torch\lib copy "%CUDA_PATH%\extras\CUPTI\lib64\nvperf_host*.dll*" pytorch\torch\lib @@ -20,8 +28,3 @@ copy "%libuv_ROOT%\bin\uv.dll" pytorch\torch\lib if exist "C:\Windows\System32\zlibwapi.dll" ( copy "C:\Windows\System32\zlibwapi.dll" pytorch\torch\lib ) - -::copy nvJitLink dll is requires for cuda 12+ -if exist 
"%CUDA_PATH%\bin\nvJitLink_*.dll*" ( - copy "%CUDA_PATH%\bin\nvJitLink_*.dll*" pytorch\torch\lib -) diff --git a/.ci/pytorch/windows/internal/driver_update.bat b/.ci/pytorch/windows/internal/driver_update.bat index 5ed3a236c09a..2c173aed818b 100644 --- a/.ci/pytorch/windows/internal/driver_update.bat +++ b/.ci/pytorch/windows/internal/driver_update.bat @@ -1,9 +1,9 @@ -set WIN_DRIVER_VN=528.89 -set "DRIVER_DOWNLOAD_LINK=https://ossci-windows.s3.amazonaws.com/%WIN_DRIVER_VN%-data-center-tesla-desktop-winserver-2016-2019-2022-dch-international.exe" & REM @lint-ignore -curl --retry 3 -kL %DRIVER_DOWNLOAD_LINK% --output %WIN_DRIVER_VN%-data-center-tesla-desktop-winserver-2016-2019-2022-dch-international.exe +set WIN_DRIVER_VN=580.88 +set "DRIVER_DOWNLOAD_LINK=https://ossci-windows.s3.amazonaws.com/%WIN_DRIVER_VN%-data-center-tesla-desktop-win10-win11-64bit-dch-international.exe" & REM @lint-ignore +curl --retry 3 -kL %DRIVER_DOWNLOAD_LINK% --output %WIN_DRIVER_VN%-data-center-tesla-desktop-win10-win11-64bit-dch-international.exe if errorlevel 1 exit /b 1 -start /wait %WIN_DRIVER_VN%-data-center-tesla-desktop-winserver-2016-2019-2022-dch-international.exe -s -noreboot +start /wait %WIN_DRIVER_VN%-data-center-tesla-desktop-win10-win11-64bit-dch-international.exe -s -noreboot if errorlevel 1 exit /b 1 -del %WIN_DRIVER_VN%-data-center-tesla-desktop-winserver-2016-2019-2022-dch-international.exe || ver > NUL +del %WIN_DRIVER_VN%-data-center-tesla-desktop-win10-win11-64bit-dch-international.exe || ver > NUL diff --git a/.ci/pytorch/windows/internal/xpu_install.bat b/.ci/pytorch/windows/internal/xpu_install.bat index 2296adf4dfe6..f143571a5692 100644 --- a/.ci/pytorch/windows/internal/xpu_install.bat +++ b/.ci/pytorch/windows/internal/xpu_install.bat @@ -13,9 +13,9 @@ if not exist "%SRC_DIR%\temp_build" mkdir "%SRC_DIR%\temp_build" :xpu_bundle_install_start set XPU_BUNDLE_PARENT_DIR=C:\Program Files (x86)\Intel\oneAPI -set XPU_BUNDLE_URL=https://registrationcenter-download.intel.com/akdlm/IRC_NAS/9d6d6c17-ca2d-4735-9331-99447e4a1280/intel-deep-learning-essentials-2025.0.1.28_offline.exe +set XPU_BUNDLE_URL=https://registrationcenter-download.intel.com/akdlm/IRC_NAS/75d4eb97-914a-4a95-852c-7b9733d80f74/intel-deep-learning-essentials-2025.1.3.8_offline.exe set XPU_BUNDLE_PRODUCT_NAME=intel.oneapi.win.deep-learning-essentials.product -set XPU_BUNDLE_VERSION=2025.0.1+20 +set XPU_BUNDLE_VERSION=2025.1.3+5 set XPU_BUNDLE_INSTALLED=0 set XPU_BUNDLE_UNINSTALL=0 set XPU_EXTRA_URL=NULL @@ -24,9 +24,9 @@ set XPU_EXTRA_VERSION=2025.0.1+1226 set XPU_EXTRA_INSTALLED=0 set XPU_EXTRA_UNINSTALL=0 -if not [%XPU_VERSION%]==[] if [%XPU_VERSION%]==[2025.1] ( - set XPU_BUNDLE_URL=https://registrationcenter-download.intel.com/akdlm/IRC_NAS/75d4eb97-914a-4a95-852c-7b9733d80f74/intel-deep-learning-essentials-2025.1.3.8_offline.exe - set XPU_BUNDLE_VERSION=2025.1.3+5 +if not [%XPU_VERSION%]==[] if [%XPU_VERSION%]==[2025.2] ( + set XPU_BUNDLE_URL=https://registrationcenter-download.intel.com/akdlm/IRC_NAS/24751ead-ddc5-4479-b9e6-f9fe2ff8b9f2/intel-deep-learning-essentials-2025.2.1.25_offline.exe + set XPU_BUNDLE_VERSION=2025.2.1+20 ) :: Check if XPU bundle is target version or already installed @@ -90,14 +90,3 @@ if errorlevel 1 exit /b 1 del xpu_extra.exe :xpu_install_end - -if not "%XPU_ENABLE_KINETO%"=="1" goto install_end -:: Install Level Zero SDK -set XPU_EXTRA_LZ_URL=https://github.com/oneapi-src/level-zero/releases/download/v1.14.0/level-zero-sdk_1.14.0.zip -curl -k -L %XPU_EXTRA_LZ_URL% --output 
"%SRC_DIR%\temp_build\level_zero_sdk.zip" -echo "Installing level zero SDK..." -7z x "%SRC_DIR%\temp_build\level_zero_sdk.zip" -o"%SRC_DIR%\temp_build\level_zero" -set "INCLUDE=%SRC_DIR%\temp_build\level_zero\include;%INCLUDE%" -del "%SRC_DIR%\temp_build\level_zero_sdk.zip" - -:install_end diff --git a/.ci/wheel/build_wheel.sh b/.ci/wheel/build_wheel.sh index b9b6448ae208..e63a68e4f193 100755 --- a/.ci/wheel/build_wheel.sh +++ b/.ci/wheel/build_wheel.sh @@ -124,19 +124,15 @@ popd export TH_BINARY_BUILD=1 export INSTALL_TEST=0 # dont install test binaries into site-packages -export MACOSX_DEPLOYMENT_TARGET=10.15 +export MACOSX_DEPLOYMENT_TARGET=11.0 export CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname $(which conda))/../"} -SETUPTOOLS_PINNED_VERSION="==70.1.0" -PYYAML_PINNED_VERSION="==5.3" EXTRA_CONDA_INSTALL_FLAGS="" CONDA_ENV_CREATE_FLAGS="" RENAME_WHEEL=true case $desired_python in 3.14t) echo "Using 3.14 deps" - SETUPTOOLS_PINNED_VERSION=">=70.1.0" - PYYAML_PINNED_VERSION=">=6.0.1" NUMPY_PINNED_VERSION="==2.1.0" CONDA_ENV_CREATE_FLAGS="python-freethreading" EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge" @@ -145,8 +141,6 @@ case $desired_python in ;; 3.14) echo "Using 3.14t deps" - SETUPTOOLS_PINNED_VERSION=">=70.1.0" - PYYAML_PINNED_VERSION=">=6.0.1" NUMPY_PINNED_VERSION="==2.1.0" EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge" desired_python="3.14.0rc1" @@ -154,8 +148,6 @@ case $desired_python in ;; 3.13t) echo "Using 3.13 deps" - SETUPTOOLS_PINNED_VERSION=">=70.1.0" - PYYAML_PINNED_VERSION=">=6.0.1" NUMPY_PINNED_VERSION="==2.1.0" CONDA_ENV_CREATE_FLAGS="python-freethreading" EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge" @@ -164,37 +156,23 @@ case $desired_python in ;; 3.13) echo "Using 3.13 deps" - SETUPTOOLS_PINNED_VERSION=">=70.1.0" - PYYAML_PINNED_VERSION=">=6.0.1" NUMPY_PINNED_VERSION="==2.1.0" ;; 3.12) echo "Using 3.12 deps" - SETUPTOOLS_PINNED_VERSION=">=70.1.0" - PYYAML_PINNED_VERSION=">=6.0.1" NUMPY_PINNED_VERSION="==2.0.2" ;; 3.11) echo "Using 3.11 deps" - SETUPTOOLS_PINNED_VERSION=">=70.1.0" - PYYAML_PINNED_VERSION=">=5.3" NUMPY_PINNED_VERSION="==2.0.2" ;; 3.10) echo "Using 3.10 deps" - SETUPTOOLS_PINNED_VERSION=">=70.1.0" - PYYAML_PINNED_VERSION=">=5.3" - NUMPY_PINNED_VERSION="==2.0.2" - ;; - 3.9) - echo "Using 3.9 deps" - SETUPTOOLS_PINNED_VERSION=">=70.1.0" - PYYAML_PINNED_VERSION=">=5.3" NUMPY_PINNED_VERSION="==2.0.2" ;; *) - echo "Using default deps" - NUMPY_PINNED_VERSION="==1.11.3" + echo "Unsupported version $desired_python" + exit 1 ;; esac @@ -204,8 +182,6 @@ conda create ${EXTRA_CONDA_INSTALL_FLAGS} -yn "$tmp_env_name" python="$desired_p source activate "$tmp_env_name" PINNED_PACKAGES=( - "setuptools${SETUPTOOLS_PINNED_VERSION}" - "pyyaml${PYYAML_PINNED_VERSION}" "numpy${NUMPY_PINNED_VERSION}" ) retry pip install "${PINNED_PACKAGES[@]}" -r "${pytorch_rootdir}/requirements-build.txt" @@ -223,7 +199,7 @@ export BUILD_TEST=OFF pushd "$pytorch_rootdir" echo "Calling setup.py bdist_wheel at $(date)" -python setup.py bdist_wheel -d "$whl_tmp_dir" +python setup.py bdist_wheel -d "$whl_tmp_dir" --plat-name ${mac_version} echo "Finished setup.py bdist_wheel at $(date)" diff --git a/.circleci/scripts/binary_populate_env.sh b/.circleci/scripts/binary_populate_env.sh index 87fea14b8d28..aa82d36aa7ce 100755 --- a/.circleci/scripts/binary_populate_env.sh +++ b/.circleci/scripts/binary_populate_env.sh @@ -5,7 +5,9 @@ export TZ=UTC tagged_version() { GIT_DIR="${workdir}/pytorch/.git" GIT_DESCRIBE="git --git-dir ${GIT_DIR} describe 
--tags --match v[0-9]*.[0-9]*.[0-9]*" - if [[ ! -d "${GIT_DIR}" ]]; then + if [[ -n "${CIRCLE_TAG:-}" ]]; then + echo "${CIRCLE_TAG}" + elif [[ ! -d "${GIT_DIR}" ]]; then echo "Abort, abort! Git dir ${GIT_DIR} does not exists!" kill $$ elif ${GIT_DESCRIBE} --exact >/dev/null; then @@ -69,16 +71,11 @@ fi export PYTORCH_BUILD_NUMBER=1 +# This part is done in the builder scripts so commenting the duplicate code +: <<'BLOCK_COMMENT' # Set triton version as part of PYTORCH_EXTRA_INSTALL_REQUIREMENTS TRITON_VERSION=$(cat $PYTORCH_ROOT/.ci/docker/triton_version.txt) - -# Here PYTORCH_EXTRA_INSTALL_REQUIREMENTS is already set for the all the wheel builds hence append TRITON_CONSTRAINT -TRITON_CONSTRAINT="platform_system == 'Linux' and platform_machine == 'x86_64'" - -# CUDA 12.9 builds have triton for Linux and Linux aarch64 binaries. -if [[ "$DESIRED_CUDA" == "cu129" ]]; then - TRITON_CONSTRAINT="platform_system == 'Linux'" -fi +TRITON_CONSTRAINT="platform_system == 'Linux'" if [[ "$PACKAGE_TYPE" =~ .*wheel.* && -n "${PYTORCH_EXTRA_INSTALL_REQUIREMENTS:-}" && ! "$PYTORCH_BUILD_VERSION" =~ .*xpu.* ]]; then TRITON_REQUIREMENT="triton==${TRITON_VERSION}; ${TRITON_CONSTRAINT}" @@ -117,6 +114,7 @@ if [[ "$PACKAGE_TYPE" =~ .*wheel.* && -n "$PYTORCH_BUILD_VERSION" && "$PYTORCH_B export PYTORCH_EXTRA_INSTALL_REQUIREMENTS="${PYTORCH_EXTRA_INSTALL_REQUIREMENTS} | ${TRITON_REQUIREMENT}" fi fi +BLOCK_COMMENT USE_GLOO_WITH_OPENSSL="ON" if [[ "$GPU_ARCH_TYPE" =~ .*aarch64.* ]]; then diff --git a/.circleci/scripts/binary_windows_build.sh b/.circleci/scripts/binary_windows_build.sh index 27cd36f94928..18dcde50e2b6 100644 --- a/.circleci/scripts/binary_windows_build.sh +++ b/.circleci/scripts/binary_windows_build.sh @@ -15,8 +15,7 @@ fi if [[ "$DESIRED_CUDA" == 'xpu' ]]; then export VC_YEAR=2022 export USE_SCCACHE=0 - export XPU_VERSION=2025.1 - export XPU_ENABLE_KINETO=1 + export XPU_VERSION=2025.2 fi echo "Free space on filesystem before build:" diff --git a/.circleci/scripts/binary_windows_test.sh b/.circleci/scripts/binary_windows_test.sh index 79f714265f2c..9326d9037e8b 100644 --- a/.circleci/scripts/binary_windows_test.sh +++ b/.circleci/scripts/binary_windows_test.sh @@ -8,7 +8,7 @@ export VC_YEAR=2022 if [[ "$DESIRED_CUDA" == 'xpu' ]]; then export VC_YEAR=2022 - export XPU_VERSION=2025.1 + export XPU_VERSION=2025.2 fi pushd "$PYTORCH_ROOT/.ci/pytorch/" diff --git a/.github/actionlint.yaml b/.github/actionlint.yaml index 85c7999c1857..798dee312306 100644 --- a/.github/actionlint.yaml +++ b/.github/actionlint.yaml @@ -12,7 +12,9 @@ self-hosted-runner: - linux.9xlarge.ephemeral - am2.linux.9xlarge.ephemeral - linux.12xlarge + - linux.12xlarge.memory - linux.24xlarge + - linux.24xlarge.memory - linux.24xlarge.ephemeral - linux.24xlarge.amd - linux.arm64.2xlarge diff --git a/.github/actions/build-external-packages/action.yml b/.github/actions/build-external-packages/action.yml index dc8b8b889536..c0c727d93ac6 100644 --- a/.github/actions/build-external-packages/action.yml +++ b/.github/actions/build-external-packages/action.yml @@ -4,6 +4,11 @@ name: Build External packages description: build external packages for PyTorch inputs: + cuda-version: + description: CUDA version to use + type: string + required: true + default: '12.8.1' cuda-arch-list: description: TORCH_CUDA_ARCH_LIST (e.g., "8.0;8.9;9.0") type: string @@ -44,10 +49,12 @@ runs: env: SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2 SCCACHE_REGION: us-east-1 + CUDA_VERSION: ${{ inputs.cuda-version }} TORCH_CUDA_ARCH_LIST: ${{ inputs.cuda-arch-list }} 
BASE_IMAGE: ${{ inputs.docker-image }} BUILD_TARGETS: ${{ inputs.build-targets }} - PARENT_OUTPUT_DIR: ${{ inputs.output-dir}} + PARENT_OUTPUT_DIR: ${{ inputs.output-dir }} + TORCH_WHEELS_PATH: ${{ inputs.torch-wheel-dir }} shell: bash run: | set -euo pipefail @@ -68,7 +75,6 @@ runs: export OUTPUT_DIR echo "Building external package: $target in directory $OUTPUT_DIR" python3 -m cli.run build external "$target" - done END_TIME=$(date +%s) diff --git a/.github/actions/checkout-pytorch/action.yml b/.github/actions/checkout-pytorch/action.yml index 055404c69474..15f193ef3a5d 100644 --- a/.github/actions/checkout-pytorch/action.yml +++ b/.github/actions/checkout-pytorch/action.yml @@ -57,6 +57,21 @@ runs: submodules: ${{ inputs.submodules }} show-progress: false + - name: Clean submodules post checkout + id: clean-submodules + if: ${{ steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' }} + shell: bash + env: + NO_SUDO: ${{ inputs.no-sudo }} + run: | + cd "${GITHUB_WORKSPACE}" + # Clean stale submodule dirs + if [ -z "${NO_SUDO}" ]; then + sudo git submodule foreach --recursive git clean -ffdx + else + git submodule foreach --recursive git clean -ffdx + fi + - name: Clean workspace (try again) if: ${{ steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' && (steps.first-clean.outcome != 'success' || steps.first-checkout-attempt.outcome != 'success') }} diff --git a/.github/actions/setup-win/action.yml b/.github/actions/setup-win/action.yml index 93c957896b5e..2ea330f93b49 100644 --- a/.github/actions/setup-win/action.yml +++ b/.github/actions/setup-win/action.yml @@ -6,6 +6,12 @@ inputs: cuda-version: description: which cuda version to install, 'cpu' for none required: true + python-version: + required: false + type: string + default: "3.10" + description: | + The python version to be used. Will be 3.10 by default runs: using: composite @@ -38,18 +44,24 @@ runs: CONDA="C:\Jenkins\Miniconda3\condabin\conda.bat" { + echo "CONDA=${CONDA}"; echo "CONDA_RUN=${CONDA} run --no-capture-output"; echo "CONDA_BUILD=${CONDA} run conda-build"; echo "CONDA_INSTALL=${CONDA} install"; } >> "${GITHUB_ENV}" - name: Setup Python3 + env: + PYTHON_VERSION: ${{ inputs.python-version }} shell: bash run: | set +e set -x - PYTHON3=$(${CONDA_RUN} which python3) + # Create new py_tmp env with python-version + ${CONDA} create -y -n py_tmp python=${PYTHON_VERSION} intel-openmp libuv + + PYTHON3=$(${CONDA_RUN} -n py_tmp which python3) EXIT_CODE=$? if [[ "${EXIT_CODE}" == "0" ]]; then @@ -62,7 +74,7 @@ runs: # installation, which is Python 3 based. Its Python is default to Python 3. Further, there # is also the Miniconda installation that is Python 2 based, and both can be installed if # needed. In both cases, Python binary is just called python - PYTHON=$(${CONDA_RUN} which python) + PYTHON=$(${CONDA_RUN} -n py_tmp which python) EXIT_CODE=$? 
if [[ "${EXIT_CODE}" == "0" ]]; then diff --git a/.github/ci_commit_pins/audio.txt b/.github/ci_commit_pins/audio.txt index 0b9c14848239..b0255e764c59 100644 --- a/.github/ci_commit_pins/audio.txt +++ b/.github/ci_commit_pins/audio.txt @@ -1 +1 @@ -10a5002c6195bd95e34df8fe28ff8a2d55a2a922 +27fc2493d383354a008106f22f3be232badee9a1 diff --git a/.github/ci_commit_pins/vllm.txt b/.github/ci_commit_pins/vllm.txt index 80c5a90c7be9..bbc484d273a1 100644 --- a/.github/ci_commit_pins/vllm.txt +++ b/.github/ci_commit_pins/vllm.txt @@ -1 +1 @@ -add1adfec742dfb13e614dab3372b5aafd1ff046 +78a47f87ce259a48f0391fa9ae15add05ea7432b diff --git a/.github/ci_commit_pins/xla.txt b/.github/ci_commit_pins/xla.txt index 53cf6c8c9915..ee530f8c8b21 100644 --- a/.github/ci_commit_pins/xla.txt +++ b/.github/ci_commit_pins/xla.txt @@ -1 +1 @@ -a1c6ee92c85e8b0955c20892ed68f032a6015c09 +r2.9 diff --git a/.github/ci_configs/vllm/Dockerfile.tmp_vllm b/.github/ci_configs/vllm/Dockerfile.tmp_vllm index 330a78424fee..2cee6ed2df19 100644 --- a/.github/ci_configs/vllm/Dockerfile.tmp_vllm +++ b/.github/ci_configs/vllm/Dockerfile.tmp_vllm @@ -12,54 +12,46 @@ ARG BUILD_BASE_IMAGE=torch-nightly-base # by default, it uses devel-ubuntu22.04 official image. ARG FINAL_BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 +# The logic is copied from https://github.com/vllm-project/vllm/blob/main/docker/Dockerfile +ARG GET_PIP_URL="https://bootstrap.pypa.io/get-pip.py" -#################### TORCH NIGHTLY BASE IMAGE #################### + +#################### TORCH NIGHTLY BASE IMAGE #################### # A base image for building vLLM with devel ubuntu 22.04, this is mainly used to build vllm in vllm builtkite ci -From nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 as torch-nightly-base -ARG CUDA_VERSION=12.8.1 -ARG PYTHON_VERSION=3.12 -ARG TARGETPLATFORM -ENV DEBIAN_FRONTEND=noninteractive - -RUN PYTHON_VERSION_STR=$(echo ${PYTHON_VERSION} | sed 's/\.//g') && \ - echo "export PYTHON_VERSION_STR=${PYTHON_VERSION_STR}" >> /etc/environment - -# Install Python and other dependencies if it does not existed -RUN if ! command -v python3 >/dev/null || ! python3 --version | grep -q "${PYTHON_VERSION}"; then \ - echo "Installing Python ${PYTHON_VERSION}..." 
&& \ - echo 'tzdata tzdata/Areas select America' | debconf-set-selections && \ - echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections && \ - apt-get update -y && \ - apt-get install -y ccache software-properties-common git curl sudo && \ - for i in 1 2 3; do \ - add-apt-repository -y ppa:deadsnakes/ppa && break || \ - { echo "Attempt $i failed, retrying in 5s..."; sleep 5; }; \ - done && \ - apt-get update -y && \ - apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv && \ - update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 && \ - update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} && \ - ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config && \ - curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION}; \ - else \ - echo "Python ${PYTHON_VERSION} already present, skipping setup."; \ - fi \ - && python3 --version && python3 -m pip --version +FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 as torch-nightly-base + +ARG CUDA_VERSION +ARG PYTHON_VERSION +ARG GET_PIP_URL + +# Install Python and other dependencies +RUN apt-get update -y \ + && apt-get install -y ccache software-properties-common git curl wget sudo vim \ + && add-apt-repository -y ppa:deadsnakes/ppa \ + && apt-get update -y \ + && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \ + && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \ + && update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \ + && ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \ + && curl -sS ${GET_PIP_URL} | python${PYTHON_VERSION} \ + && python3 --version && python3 -m pip --version # Upgrade to GCC 10 to avoid https://gcc.gnu.org/bugzilla/show_bug.cgi?id=92519 # as it was causing spam when compiling the CUTLASS kernels # Ensure gcc >= 10 to avoid CUTLASS issues (bug 92519) RUN current_gcc_version=$(gcc -dumpversion | cut -f1 -d.) 
&& \ - if [ "$current_gcc_version" -lt 10 ]; then \ - echo "GCC version is $current_gcc_version, installing gcc-10..."; \ - apt-get update && \ - apt-get install -y gcc-10 g++-10 && \ - update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-10 100 && \ - update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-10 100; \ - else \ - echo "GCC version is $current_gcc_version, no need to install gcc-10."; \ - fi && \ - gcc --version && g++ --version + if command -v apt-get >/dev/null; then \ + if [ "$current_gcc_version" -lt 10 ]; then \ + echo "GCC version is $current_gcc_version, installing gcc-10..."; \ + apt-get update \ + && apt-get install -y gcc-10 g++-10 \ + && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-10 100 \ + && update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-10 100; \ + else \ + echo "GCC version is $current_gcc_version, no need to install gcc-10."; \ + fi \ + fi \ + && gcc --version && g++ --version # install uv for faster pip installs RUN --mount=type=cache,target=/root/.cache/uv \ @@ -79,6 +71,21 @@ ENV UV_LINK_MODE=copy FROM ${BUILD_BASE_IMAGE} AS base USER root +ARG CUDA_VERSION +ARG PYTHON_VERSION + +# TODO (huydhn): Only work with PyTorch manylinux builder +ENV PATH="/opt/python/cp312-cp312/bin:${PATH}" + +# Install some system dependencies and double check python version +RUN if command -v apt-get >/dev/null; then \ + apt-get update -y \ + && apt-get install -y ccache software-properties-common git curl wget sudo vim; \ + else \ + dnf install -y git curl wget sudo vim; \ + fi \ + && python3 --version && python3 -m pip --version + # Workaround for https://github.com/openai/triton/issues/2507 and # https://github.com/pytorch/pytorch/issues/107960 -- hopefully # this won't be needed for future versions of this docker image @@ -118,17 +125,15 @@ RUN --mount=type=bind,source=${TORCH_WHEELS_PATH},target=/dist \ if [ -n "$TORCH_WHEELS_PATH" ] && [ "$TORCH_WHEELS_PATH" != "./requirements" ] && [ -d "/dist" ] && ls /dist/torch*.whl >/dev/null 2>&1; then \ echo "[INFO] Installing torch wheels to build vllm"; \ torch_whl=$(find /dist -maxdepth 1 -name 'torch-*.whl' -print -quit); \ - vision_whl=$(find /dist/vision -name 'torchvision*.whl' | head -n1 | xargs); \ - audio_whl=$(find /dist/audio -name 'torchaudio*.whl' | head -n1 | xargs); \ - uv pip install --system "${torch_whl}[opt-einsum]"; \ - uv pip install --system "${vision_whl}"; \ - uv pip install --system "${audio_whl}"; \ + vision_whl=$(find /dist -name 'torchvision*.whl' | head -n1 | xargs); \ + audio_whl=$(find /dist -name 'torchaudio*.whl' | head -n1 | xargs); \ + uv pip install --system "${torch_whl}[opt-einsum]" "${vision_whl}" "${audio_whl}" /dist/*.whl; \ elif [ -n "$PINNED_TORCH_VERSION" ]; then \ echo "[INFO] Installing pinned torch nightly version to build vllm: $PINNED_TORCH_VERSION"; \ - uv pip install --system "$PINNED_TORCH_VERSION" --index-url https://download.pytorch.org/whl/nightly/cu128; \ + uv pip install --system "$PINNED_TORCH_VERSION" --index-url https://download.pytorch.org/whl/nightly/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.'); \ else \ echo "[INFO] Installing torch nightly with latest one to build vllm"; \ - uv pip install --system torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu128; \ + uv pip install --system torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu$(echo $CUDA_VERSION | cut -d. 
-f1,2 | tr -d '.'); \ fi # Install numba 0.61.2 for cuda environment @@ -137,12 +142,11 @@ RUN --mount=type=cache,target=/root/.cache/uv \ # Install common dependencies from vllm common.txt RUN --mount=type=cache,target=/root/.cache/uv \ -uv pip install --system -r requirements/common.txt - + uv pip install --system -r requirements/common.txt # Must put before installing xformers, so it can install the correct version of xfomrers. -ARG exformer_cuda_arch_list='7.5;8.0+PTX;9.0a' -ENV TORCH_CUDA_ARCH_LIST=${exformer_cuda_arch_list} +ARG xformers_cuda_arch_list='7.5;8.0+PTX;9.0a' +ENV TORCH_CUDA_ARCH_LIST=${xformers_cuda_arch_list} ARG max_jobs=16 ENV MAX_JOBS=${max_jobs} @@ -153,8 +157,8 @@ RUN pip freeze | grep -E 'ninja' # Build xformers with cuda and torch nightly/wheel # following official xformers guidance: https://github.com/facebookresearch/xformers#build -# sha for https://github.com/facebookresearch/xformers/tree/v0.0.31 -ARG XFORMERS_COMMIT=eb0946a363464da96ea40afd1a7f72a907c25497 +# sha for https://github.com/facebookresearch/xformers/tree/v0.0.32.post2 +ARG XFORMERS_COMMIT=5d4b92a5e5a9c6c6d4878283f47d82e17995b468 ENV CCACHE_DIR=/root/.cache/ccache RUN --mount=type=cache,target=/root/.cache/ccache \ @@ -176,6 +180,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \ # Build can take a long time, and the torch nightly version fetched from url can be different in next docker stage. # track the nightly torch version used in the build, when we set up runtime environment we can make sure the version is the same RUN uv pip freeze | grep -i '^torch\|^torchvision\|^torchaudio' > torch_build_versions.txt + RUN cat torch_build_versions.txt RUN pip freeze | grep -E 'torch|xformers|torchvision|torchaudio' @@ -187,11 +192,6 @@ RUN pip freeze | grep -E 'torch|xformers|torchvision|torchaudio' FROM base AS build ARG TARGETPLATFORM -ENV UV_HTTP_TIMEOUT=500 -ENV UV_INDEX_STRATEGY="unsafe-best-match" -# Use copy mode to avoid hardlink failures with Docker cache mounts -ENV UV_LINK_MODE=copy - COPY . . RUN python3 use_existing_torch.py @@ -250,9 +250,9 @@ RUN --mount=type=cache,target=/root/.cache/ccache \ python3 setup.py bdist_wheel --dist-dir=vllm-dist --py-limited-api=cp38; \ fi -RUN echo "[DEBUG] Listing current directory:" && \ +RUN echo "[INFO] Listing current directory:" && \ ls -al && \ - echo "[DEBUG] Showing torch_build_versions.txt content:" && \ + echo "[INFO] Showing torch_build_versions.txt content:" && \ cat torch_build_versions.txt #################### WHEEL BUILD IMAGE #################### @@ -262,42 +262,40 @@ RUN echo "[DEBUG] Listing current directory:" && \ # Setup clean environment for vLLM for test and api server using ubuntu22.04 with AOT flashinfer FROM ${FINAL_BASE_IMAGE} AS vllm-base USER root + +ARG CUDA_VERSION +ARG PYTHON_VERSION +ARG GET_PIP_URL + +# TODO (huydhn): Only work with PyTorch manylinux builder +ENV PATH="/opt/python/cp312-cp312/bin:${PATH}" + # prepare for environment starts WORKDIR /workspace -RUN PYTHON_VERSION_STR=$(echo ${PYTHON_VERSION} | sed 's/\.//g') && \ - echo "export PYTHON_VERSION_STR=${PYTHON_VERSION_STR}" >> /etc/environment - -# Install Python and other dependencies if it does not existed -RUN if ! command -v python3 >/dev/null || ! python3 --version | grep -q "${PYTHON_VERSION}"; then \ - echo "Installing Python ${PYTHON_VERSION}..." 
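Editorial note: the nightly index URL used in the torch install steps above is derived from CUDA_VERSION by keeping only the major and minor components and dropping the dot, so 12.8.1 becomes cu128. A minimal Python sketch of that mapping (the helper name is illustrative, not part of the Dockerfile):

# Sketch: reproduce `cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')` in Python.
def nightly_cuda_suffix(cuda_version: str) -> str:
    """Map a full CUDA version such as '12.8.1' to the wheel index suffix 'cu128'."""
    major, minor = cuda_version.split(".")[:2]
    return f"cu{major}{minor}"

assert nightly_cuda_suffix("12.8.1") == "cu128"
assert nightly_cuda_suffix("13.0.0") == "cu130"
print("https://download.pytorch.org/whl/nightly/" + nightly_cuda_suffix("12.8.1"))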
&& \ - echo 'tzdata tzdata/Areas select America' | debconf-set-selections && \ - echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections && \ - apt-get update -y && \ - apt-get install -y ccache software-properties-common git curl sudo && \ - for i in 1 2 3; do \ - add-apt-repository -y ppa:deadsnakes/ppa && break || \ - { echo "Attempt $i failed, retrying in 5s..."; sleep 5; }; \ - done && \ - apt-get update -y && \ - apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv && \ - update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 && \ - update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} && \ - ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config && \ - curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION}; \ - else \ - echo "Python ${PYTHON_VERSION} already present, skipping setup."; \ - fi \ - && python3 --version && python3 -m pip --version - +# Install Python and other dependencies +RUN if command -v apt-get >/dev/null; then \ + apt-get update -y \ + && apt-get install -y ccache software-properties-common git curl wget sudo vim \ + && add-apt-repository -y ppa:deadsnakes/ppa \ + && apt-get update -y \ + && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \ + && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \ + && update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \ + && ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \ + && curl -sS ${GET_PIP_URL} | python${PYTHON_VERSION}; \ + else \ + dnf install -y git curl wget sudo vim; \ + fi \ + && python3 --version && python3 -m pip --version # Get the torch versions, and whls used in previous stagtes for consistency COPY --from=base /workspace/torch_build_versions.txt ./torch_build_versions.txt COPY --from=base /workspace/xformers-dist /wheels/xformers COPY --from=build /workspace/vllm-dist /wheels/vllm -RUN echo "[DEBUG] Listing current directory before torch install step:" && \ +RUN echo "[INFO] Listing current directory before torch install step:" && \ ls -al && \ - echo "[DEBUG] Showing torch_build_versions.txt content:" && \ + echo "[INFO] Showing torch_build_versions.txt content:" && \ cat torch_build_versions.txt # Workaround for https://github.com/openai/triton/issues/2507 and @@ -306,7 +304,6 @@ RUN echo "[DEBUG] Listing current directory before torch install step:" && \ # or future versions of triton. RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/ - # Install uv for faster pip installs if not existed RUN --mount=type=cache,target=/root/.cache/uv \ if ! 
python3 -m uv --version > /dev/null 2>&1; then \ @@ -326,15 +323,13 @@ RUN --mount=type=bind,source=${TORCH_WHEELS_PATH},target=/dist \ --mount=type=cache,target=/root/.cache/uv \ if [ -n "$TORCH_WHEELS_PATH" ] && [ "$TORCH_WHEELS_PATH" != "./requirements" ] && [ -d "/dist" ] && ls /dist/torch*.whl >/dev/null 2>&1; then \ torch_whl=$(find /dist -maxdepth 1 -name 'torch-*.whl' -print -quit); \ - vision_whl=$(find /dist/vision -name 'torchvision*.whl' | head -n1 | xargs); \ - audio_whl=$(find /dist/audio -name 'torchaudio*.whl' | head -n1 | xargs); \ + vision_whl=$(find /dist -name 'torchvision*.whl' | head -n1 | xargs); \ + audio_whl=$(find /dist -name 'torchaudio*.whl' | head -n1 | xargs); \ echo "[INFO] Use wheels to build : '${torch_whl}' '${audio_whl}' '${vision_whl}'"; \ - uv pip install --system "${torch_whl}[opt-einsum]"; \ - uv pip install --system "${vision_whl}"; \ - uv pip install --system "${audio_whl}"; \ + uv pip install --system "${torch_whl}[opt-einsum]" "${vision_whl}" "${audio_whl}" /dist/*.whl; \ else \ echo "[INFO] Installing torch versions from torch_build_versions.txt"; \ - uv pip install --system $(cat torch_build_versions.txt | xargs) --index-url https://download.pytorch.org/whl/nightly/cu128; \ + uv pip install --system $(cat torch_build_versions.txt | xargs) --index-url https://download.pytorch.org/whl/nightly/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.'); \ fi # Install the vllm wheel from previous stage @@ -345,9 +340,8 @@ RUN --mount=type=cache,target=/root/.cache/uv \ RUN --mount=type=cache,target=/root/.cache/uv \ uv pip install --system /wheels/xformers/*.whl --verbose - # Build flashinfer from source. -ARG torch_cuda_arch_list='8.0;8.9;9.0a' +ARG torch_cuda_arch_list='8.0;8.9;9.0a;10.0a;12.0' # install package for build flashinfer # see issue: https://github.com/flashinfer-ai/flashinfer/issues/738 @@ -358,7 +352,7 @@ ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list} # Build flashinfer for torch nightly from source around 10 mins ARG FLASHINFER_GIT_REPO="https://github.com/flashinfer-ai/flashinfer.git" # Keep this in sync with https://github.com/vllm-project/vllm/blob/main/requirements/cuda.txt -ARG FLASHINFER_GIT_REF="v0.2.9rc2" +ARG FLASHINFER_GIT_REF="v0.2.14.post1" RUN --mount=type=cache,target=/root/.cache/uv \ git clone --depth 1 --recursive --shallow-submodules \ --branch ${FLASHINFER_GIT_REF} \ @@ -376,6 +370,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \ # Logging to confirm the torch versions RUN pip freeze | grep -E 'torch|xformers|vllm|flashinfer' +RUN uv pip freeze | grep -i '^torch\|^torchvision\|^torchaudio\|^xformers\|^vllm\|^flashinfer' > build_summary.txt ################### VLLM INSTALLED IMAGE #################### @@ -414,11 +409,6 @@ RUN --mount=type=cache,target=/root/.cache/uv \ RUN --mount=type=cache,target=/root/.cache/uv \ uv pip install --system -r requirements/nightly_torch_test.txt -# Workaround for #17068 -# pinned commit for v2.2.4 -RUN --mount=type=cache,target=/root/.cache/uv \ - uv pip install --system --no-build-isolation "git+https://github.com/state-spaces/mamba@95d8aba8a8c75aedcaa6143713b11e745e7cd0d9#egg=mamba-ssm" - # Logging to confirm the torch versions RUN pip freeze | grep -E 'torch|xformers|vllm|flashinfer' @@ -433,4 +423,5 @@ FROM scratch as export-wheels # Just copy the wheels we prepared in previous stages COPY --from=base /workspace/xformers-dist /wheels/xformers COPY --from=build /workspace/vllm-dist /wheels/vllm +COPY --from=vllm-base /workspace/build_summary.txt /wheels/build_summary.txt COPY 
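Editorial note: TORCH_CUDA_ARCH_LIST values such as '8.0;8.9;9.0a;10.0a;12.0' select which GPU architectures the xformers/flashinfer kernels above are compiled for. A rough, hedged sketch of how such a list is conventionally expanded into nvcc -gencode flags; the exact expansion these builds perform may differ in detail:

def arch_list_to_gencode(arch_list: str) -> list:
    """Roughly expand '8.0;9.0a;8.0+PTX' into nvcc -gencode flags (illustrative only)."""
    flags = []
    for entry in filter(None, arch_list.split(";")):
        keep_ptx = entry.endswith("+PTX")
        arch = entry.removesuffix("+PTX").replace(".", "")  # '9.0a' -> '90a'
        flags.append(f"-gencode=arch=compute_{arch},code=sm_{arch}")
        if keep_ptx:
            # '+PTX' additionally embeds PTX so newer GPUs can JIT-compile the kernel
            flags.append(f"-gencode=arch=compute_{arch},code=compute_{arch}")
    return flags

print(arch_list_to_gencode("8.0;8.9;9.0a;10.0a;12.0"))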
--from=vllm-base /workspace/wheels/flashinfer /wheels/flashinfer-python diff --git a/.github/requirements/pip-requirements-macOS.txt b/.github/requirements/pip-requirements-macOS.txt index 224835188d87..3a27cac46f71 100644 --- a/.github/requirements/pip-requirements-macOS.txt +++ b/.github/requirements/pip-requirements-macOS.txt @@ -28,7 +28,7 @@ pyyaml==6.0.2 scipy==1.12.0 setuptools==72.1.0 sympy==1.13.3 -tlparse==0.3.30 +tlparse==0.4.0 tensorboard==2.13.0 typing-extensions==4.12.2 unittest-xml-reporting<=3.2.0,>=2.0.0 diff --git a/.github/scripts/build_triton_wheel.py b/.github/scripts/build_triton_wheel.py index beec9f96aba2..f2851e331725 100644 --- a/.github/scripts/build_triton_wheel.py +++ b/.github/scripts/build_triton_wheel.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 import os +import re import shutil import sys from pathlib import Path @@ -50,6 +51,30 @@ def patch_init_py( with open(path, "w") as f: f.write(orig) +def get_rocm_version() -> str: + rocm_path = os.environ.get('ROCM_HOME') or os.environ.get('ROCM_PATH') or "/opt/rocm" + rocm_version = "0.0.0" + rocm_version_h = f"{rocm_path}/include/rocm-core/rocm_version.h" + if not os.path.isfile(rocm_version_h): + rocm_version_h = f"{rocm_path}/include/rocm_version.h" + # The file could be missing due to 1) ROCm version < 5.2, or 2) no ROCm install. + if os.path.isfile(rocm_version_h): + RE_MAJOR = re.compile(r"#define\s+ROCM_VERSION_MAJOR\s+(\d+)") + RE_MINOR = re.compile(r"#define\s+ROCM_VERSION_MINOR\s+(\d+)") + RE_PATCH = re.compile(r"#define\s+ROCM_VERSION_PATCH\s+(\d+)") + major, minor, patch = 0, 0, 0 + for line in open(rocm_version_h): + match = RE_MAJOR.search(line) + if match: + major = int(match.group(1)) + match = RE_MINOR.search(line) + if match: + minor = int(match.group(1)) + match = RE_PATCH.search(line) + if match: + patch = int(match.group(1)) + rocm_version = str(major)+"."+str(minor)+"."+str(patch) + return rocm_version def build_triton( *, @@ -64,14 +89,24 @@ def build_triton( if "MAX_JOBS" not in env: max_jobs = os.cpu_count() or 1 env["MAX_JOBS"] = str(max_jobs) - + if not release: + # Nightly binaries include the triton commit hash, i.e. 2.1.0+e6216047b8 + # while release build should only include the version, i.e. 
2.1.0 + rocm_version = get_rocm_version() + version_suffix = f"+rocm{rocm_version}.git{commit_hash[:8]}" + version += version_suffix with TemporaryDirectory() as tmpdir: triton_basedir = Path(tmpdir) / "triton" triton_pythondir = triton_basedir / "python" triton_repo = "https://github.com/openai/triton" if device == "rocm": - triton_pkg_name = "pytorch-triton-rocm" + triton_repo = "https://github.com/ROCm/triton" + rocm_version = get_rocm_version() # e.g., "7.0.1" + if tuple(map(int, rocm_version.split("."))) > (7, 0, 0): + triton_pkg_name = "triton" + else: + triton_pkg_name = "pytorch-triton-rocm" elif device == "xpu": triton_pkg_name = "pytorch-triton-xpu" triton_repo = "https://github.com/intel/intel-xpu-backend-for-triton" @@ -84,10 +119,12 @@ def build_triton( ["git", "checkout", f"release/{ver}.{rev}.x"], cwd=triton_basedir ) else: + check_call(["git", "fetch", "origin", commit_hash], cwd=triton_basedir) check_call(["git", "checkout", commit_hash], cwd=triton_basedir) # change built wheel name and version env["TRITON_WHEEL_NAME"] = triton_pkg_name + env["TRITON_WHEEL_VERSION_SUFFIX"] = version_suffix if with_clang_ldd: env["TRITON_BUILD_WITH_CLANG_LLD"] = "1" diff --git a/.github/scripts/filter_test_configs.py b/.github/scripts/filter_test_configs.py index 9ba210a5ed2b..dd16dbc18db2 100755 --- a/.github/scripts/filter_test_configs.py +++ b/.github/scripts/filter_test_configs.py @@ -41,9 +41,9 @@ def is_cuda_or_rocm_job(job_name: Optional[str]) -> bool: } # The link to the published list of disabled jobs -DISABLED_JOBS_URL = "https://ossci-metrics.s3.amazonaws.com/disabled-jobs.json" +DISABLED_JOBS_URL = "https://ossci-metrics.s3.amazonaws.com/disabled-jobs.json?versionId=hjktHz2WOejHpxKpkqpDknTt5rMTM9KK" # and unstable jobs -UNSTABLE_JOBS_URL = "https://ossci-metrics.s3.amazonaws.com/unstable-jobs.json" +UNSTABLE_JOBS_URL = "https://ossci-metrics.s3.amazonaws.com/unstable-jobs.json?versionId=wrjdvvQTJxgvMO.rGw5MEuMsj6XbjuV7" # Some constants used to handle disabled and unstable jobs JOB_NAME_SEP = "/" diff --git a/.github/scripts/generate_binary_build_matrix.py b/.github/scripts/generate_binary_build_matrix.py index a576706ace22..4dc97ee6a284 100644 --- a/.github/scripts/generate_binary_build_matrix.py +++ b/.github/scripts/generate_binary_build_matrix.py @@ -16,18 +16,16 @@ # NOTE: Please also update the CUDA sources in `PIP_SOURCES` in tools/nightly.py when changing this -CUDA_ARCHES = ["12.6", "12.8", "12.9", "13.0"] +CUDA_ARCHES = ["12.6", "12.8", "13.0"] CUDA_STABLE = "12.8" CUDA_ARCHES_FULL_VERSION = { "12.6": "12.6.3", "12.8": "12.8.1", - "12.9": "12.9.1", "13.0": "13.0.0", } CUDA_ARCHES_CUDNN_VERSION = { "12.6": "9", "12.8": "9", - "12.9": "9", "13.0": "9", } @@ -40,99 +38,82 @@ CPU_S390X_ARCH = ["cpu-s390x"] -CUDA_AARCH64_ARCHES = ["12.9-aarch64"] +CUDA_AARCH64_ARCHES = ["12.6-aarch64", "12.8-aarch64", "13.0-aarch64"] PYTORCH_EXTRA_INSTALL_REQUIREMENTS = { "12.6": ( - "nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | " - 
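Editorial note: get_rocm_version() above scrapes ROCM_VERSION_MAJOR/MINOR/PATCH out of rocm_version.h, the nightly wheel version then gains a '+rocm<version>.git<sha8>' suffix, and the package is renamed from pytorch-triton-rocm to triton once the ROCm version is newer than 7.0.0. A small sketch of that flow with made-up header contents (the commit hash reuses the example from the comment above):

import re

sample_header = """
#define ROCM_VERSION_MAJOR 7
#define ROCM_VERSION_MINOR 0
#define ROCM_VERSION_PATCH 1
"""

def parse_rocm_version(text: str) -> str:
    """Mimic get_rocm_version(): pull major.minor.patch out of rocm_version.h text."""
    fields = {}
    for name in ("MAJOR", "MINOR", "PATCH"):
        m = re.search(rf"#define\s+ROCM_VERSION_{name}\s+(\d+)", text)
        fields[name] = int(m.group(1)) if m else 0
    return f"{fields['MAJOR']}.{fields['MINOR']}.{fields['PATCH']}"

rocm_version = parse_rocm_version(sample_header)              # '7.0.1'
commit_hash = "e6216047b8"                                    # example triton commit hash
version_suffix = f"+rocm{rocm_version}.git{commit_hash[:8]}"  # '+rocm7.0.1.gite6216047'
pkg = "triton" if tuple(map(int, rocm_version.split("."))) > (7, 0, 0) else "pytorch-triton-rocm"
print(pkg, "2.1.0" + version_suffix)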
"nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'" + "nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | " + "nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | " + "nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | " + "nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | " + "nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | " + "nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | " + "nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | " + "nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | " + "nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | " + "nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | " + "nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | " + "nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | " + "nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | " + "nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | " + "nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'" ), "12.8": ( - "nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'" - ), - "12.9": ( - "nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | " - 
"nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'" + "nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | " + "nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | " + "nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | " + "nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | " + "nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | " + "nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | " + "nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | " + "nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | " + "nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | " + "nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | " + "nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | " + "nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | " + "nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | " + "nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | " + "nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'" ), "13.0": ( - "nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cudnn-cu13==9.12.0.46; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 
'x86_64' | " - "nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'" + "nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | " + "nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | " + "nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | " + "nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | " + "nvidia-cublas==13.0.0.19; platform_system == 'Linux' | " + "nvidia-cufft==12.0.0.15; platform_system == 'Linux' | " + "nvidia-curand==10.4.0.35; platform_system == 'Linux' | " + "nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | " + "nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | " + "nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | " + "nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | " + "nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | " + "nvidia-nvtx==13.0.39; platform_system == 'Linux' | " + "nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | " + "nvidia-cufile==1.15.0.42; platform_system == 'Linux'" ), "xpu": ( - "intel-cmplr-lib-rt==2025.1.1 | " - "intel-cmplr-lib-ur==2025.1.1 | " - "intel-cmplr-lic-rt==2025.1.1 | " - "intel-sycl-rt==2025.1.1 | " - "oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "onemkl-sycl-blas==2025.1.0 | " - "onemkl-sycl-dft==2025.1.0 | " - "onemkl-sycl-lapack==2025.1.0 | " - "onemkl-sycl-rng==2025.1.0 | " - "onemkl-sycl-sparse==2025.1.0 | " - "dpcpp-cpp-rt==2025.1.1 | " - "intel-opencl-rt==2025.1.1 | " - "mkl==2025.1.0 | " - "intel-openmp==2025.1.1 | " - "tbb==2022.1.0 | " - "tcmlib==1.3.0 | " - "umf==0.10.0 | " - "intel-pti==0.12.3" + "intel-cmplr-lib-rt==2025.2.1 | " + "intel-cmplr-lib-ur==2025.2.1 | " + "intel-cmplr-lic-rt==2025.2.1 | " + "intel-sycl-rt==2025.2.1 | " + "oneccl-devel==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "oneccl==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "impi-rt==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "onemkl-sycl-blas==2025.2.0 | " + "onemkl-sycl-dft==2025.2.0 | " + "onemkl-sycl-lapack==2025.2.0 | " + "onemkl-sycl-rng==2025.2.0 | " + "onemkl-sycl-sparse==2025.2.0 | " + "dpcpp-cpp-rt==2025.2.1 | " + "intel-opencl-rt==2025.2.1 | " + "mkl==2025.2.0 | " + "intel-openmp==2025.2.1 | " + "tbb==2022.2.0 | " + "tcmlib==1.4.0 | " + "umf==0.11.0 | " + "intel-pti==0.13.1" ), } @@ -240,12 +221,8 @@ def generate_libtorch_matrix( if os == "linux": arches += CUDA_ARCHES arches += ROCM_ARCHES - if "13.0" in arches: - arches.remove("13.0") elif os == "windows": arches += CUDA_ARCHES - if "13.0" in arches: - arches.remove("13.0") if libtorch_variants is None: libtorch_variants = [ "shared-with-deps", @@ -310,8 +287,6 @@ def generate_wheels_matrix( arches += CUDA_ARCHES + ROCM_ARCHES + XPU_ARCHES elif os == "windows": arches += CUDA_ARCHES + XPU_ARCHES - if "13.0" in arches: - arches.remove("13.0") elif os == "linux-aarch64": # Separate new if as the CPU type is different and # uses different build/test scripts @@ -334,19 +309,20 @@ def generate_wheels_matrix( else arch_version ) - # TODO: Enable python 3.13t on cpu-s390x - if gpu_arch_type == "cpu-s390x" and python_version == "3.13t": - continue # TODO: Enable python 3.14 for rest - if os not in ["linux", "linux-aarch64", "macos-arm64", "windows"] and ( - python_version == "3.14" or 
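Editorial note: PYTORCH_EXTRA_INSTALL_REQUIREMENTS entries are '|'-separated PEP 508 requirement strings, and the change above drops the platform_machine == 'x86_64' clause so the pins also apply on Linux aarch64. A short sketch of how such environment markers evaluate, using the packaging library (assumed available; splitting on ' | ' mirrors how the string is written here, not necessarily how CI consumes it):

from packaging.requirements import Requirement

extra = (
    "nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | "
    "nvidia-nccl-cu12==2.27.5; platform_system == 'Linux'"
)

for spec in extra.split(" | "):
    req = Requirement(spec)
    # Evaluate the marker against a hypothetical Linux aarch64 environment;
    # without the x86_64 clause the requirement now applies there too.
    applies = req.marker.evaluate({"platform_system": "Linux", "platform_machine": "aarch64"})
    print(req.name, req.specifier, "applies on linux/aarch64:", applies)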
python_version == "3.14t" - ): + if os not in [ + "linux", + "linux-aarch64", + "linux-s390x", + "macos-arm64", + "windows", + ] and (python_version == "3.14" or python_version == "3.14t"): continue # cuda linux wheels require PYTORCH_EXTRA_INSTALL_REQUIREMENTS to install if ( - arch_version in ["13.0", "12.9", "12.8", "12.6"] + arch_version in ["13.0", "12.8", "12.6"] and os == "linux" or arch_version in CUDA_AARCH64_ARCHES ): @@ -410,6 +386,5 @@ def generate_wheels_matrix( validate_nccl_dep_consistency("13.0") -validate_nccl_dep_consistency("12.9") validate_nccl_dep_consistency("12.8") validate_nccl_dep_consistency("12.6") diff --git a/.github/scripts/generate_ci_workflows.py b/.github/scripts/generate_ci_workflows.py index 67906d4ad88d..0396c405ad0a 100755 --- a/.github/scripts/generate_ci_workflows.py +++ b/.github/scripts/generate_ci_workflows.py @@ -135,7 +135,7 @@ class OperatingSystem: build_configs=generate_binary_build_matrix.generate_wheels_matrix( OperatingSystem.LINUX, arches=["6.4"], - python_versions=["3.9"], + python_versions=["3.10"], ), ciflow_config=CIFlowConfig( labels={ diff --git a/.github/scripts/test_trymerge.py b/.github/scripts/test_trymerge.py index 58f3ca50baa1..ac3a1cc12921 100755 --- a/.github/scripts/test_trymerge.py +++ b/.github/scripts/test_trymerge.py @@ -27,6 +27,7 @@ get_drci_classifications, gh_get_team_members, GitHubPR, + iter_issue_timeline_until_comment, JobCheckState, main as trymerge_main, MandatoryChecksMissingError, @@ -34,6 +35,8 @@ RE_GHSTACK_DESC, read_merge_rules, remove_job_name_suffix, + sha_from_committed_event, + sha_from_force_push_after, validate_revert, ) @@ -124,7 +127,7 @@ def __init__(self) -> None: self.force = force self.pr_num = 76123 self.dry_run = True - self.comment_id = 0 + self.comment_id = 12345 # Set to non-zero value self.reason = "this is for testing" self.ignore_current = False self.check_mergeability = False @@ -152,9 +155,9 @@ def mock_revert( def mock_merge( pr: GitHubPR, repo: GitRepo, + comment_id: int, dry_run: bool = False, skip_mandatory_checks: bool = False, - comment_id: Optional[int] = None, timeout_minutes: int = 400, stale_pr_days: int = 3, ignore_current: bool = False, @@ -470,9 +473,9 @@ def test_main_force( mock_merge.assert_called_once_with( mock.ANY, mock.ANY, + comment_id=mock.ANY, dry_run=mock.ANY, skip_mandatory_checks=True, - comment_id=mock.ANY, ignore_current=False, ) @@ -485,9 +488,9 @@ def test_main_merge(self, mock_merge: Any, *args: Any) -> None: mock_merge.assert_called_once_with( mock.ANY, mock.ANY, + comment_id=mock.ANY, dry_run=mock.ANY, skip_mandatory_checks=False, - comment_id=mock.ANY, ignore_current=False, ) @@ -1138,5 +1141,176 @@ def test__revlist_to_prs_two_prs( ) +@mock.patch("trymerge.gh_graphql", side_effect=mocked_gh_graphql) +@mock.patch("trymerge.gh_fetch_merge_base", return_value="") +@mock.patch( + "trymerge.get_drci_classifications", side_effect=mocked_drci_classifications +) +class TestTimelineFunctions(TestCase): + """Tests for the new timeline-related functions""" + + def test_sha_from_committed_event(self, *args: Any) -> None: + """Test extracting SHA from committed event""" + # Based on actual GitHub API format - committed events have "sha" at top level + event = { + "event": "committed", + "sha": "fb21ce932ded6670c918804a0d9151b773770a7c", + } + self.assertEqual( + sha_from_committed_event(event), "fb21ce932ded6670c918804a0d9151b773770a7c" + ) + + # Test with missing SHA + event_no_sha = {"event": "committed"} + 
self.assertIsNone(sha_from_committed_event(event_no_sha)) + + def test_sha_from_force_push_after(self, *args: Any) -> None: + """Test extracting SHA from force push event""" + # NOTE: The current function doesn't handle the actual GitHub API format + # Real force push events have "commit_id" at top level, but this function + # looks for "after", "after_commit", "after_sha", or "head_sha" fields + + # Test with the legacy format the current function handles + event_legacy = { + "event": "head_ref_force_pushed", + "after": {"sha": "ef22bcbc54bb0f787e1e4ffd3d83df18fc407f5e"}, + } + self.assertEqual( + sha_from_force_push_after(event_legacy), + "ef22bcbc54bb0f787e1e4ffd3d83df18fc407f5e", + ) + + # Test with current GitHub API format (should return None with current implementation) + event_real_api = { + "event": "head_ref_force_pushed", + "commit_id": "ef22bcbc54bb0f787e1e4ffd3d83df18fc407f5e", + } + self.assertEqual( + sha_from_force_push_after(event_real_api), + "ef22bcbc54bb0f787e1e4ffd3d83df18fc407f5e", + ) # Current function doesn't handle commit_id + + # Test with missing SHA + event_no_sha = {"event": "head_ref_force_pushed"} + self.assertIsNone(sha_from_force_push_after(event_no_sha)) + + @mock.patch("trymerge.gh_fetch_json_list") + def test_iter_issue_timeline_until_comment( + self, mock_gh_fetch_json_list: Any, *args: Any + ) -> None: + """Test timeline iteration until target comment""" + # Mock timeline data based on actual GitHub API format + timeline_data = [ + {"event": "commented", "id": 100, "body": "first comment"}, + {"event": "committed", "sha": "fb21ce932ded6670c918804a0d9151b773770a7c"}, + {"event": "commented", "id": 200, "body": "target comment"}, + {"event": "commented", "id": 300, "body": "after target"}, + ] + mock_gh_fetch_json_list.return_value = timeline_data + + # Test iteration stops at target comment + events = list(iter_issue_timeline_until_comment("pytorch", "pytorch", 123, 200)) + self.assertEqual(len(events), 3) # Should stop at target comment + self.assertEqual(events[0]["event"], "commented") + self.assertEqual(events[0]["id"], 100) + self.assertEqual(events[1]["event"], "committed") + self.assertEqual(events[1]["sha"], "fb21ce932ded6670c918804a0d9151b773770a7c") + self.assertEqual(events[2]["event"], "commented") + self.assertEqual(events[2]["id"], 200) + + @mock.patch("trymerge.gh_fetch_json_list") + def test_iter_issue_timeline_until_comment_not_found( + self, mock_gh_fetch_json_list: Any, *args: Any + ) -> None: + """Test timeline iteration when target comment is not found""" + # Mock empty timeline + mock_gh_fetch_json_list.return_value = [] + + events = list(iter_issue_timeline_until_comment("pytorch", "pytorch", 123, 999)) + self.assertEqual(len(events), 0) + + @mock.patch("trymerge.iter_issue_timeline_until_comment") + def test_get_commit_sha_at_comment_commit_after_comment( + self, mock_iter_timeline: Any, *args: Any + ) -> None: + """Test get_commit_sha_at_comment returns correct SHA after comment""" + mock_iter_timeline.return_value = [ + {"event": "committed", "sha": "commit1"}, + {"event": "committed", "sha": "commit2"}, + {"event": "commented", "id": 100}, + {"event": "head_ref_force_pushed", "after": {"sha": "commit3"}}, + ] + pr = GitHubPR("pytorch", "pytorch", 77700) + sha = pr.get_commit_sha_at_comment(100) + self.assertEqual(sha, "commit2") + + @mock.patch("trymerge.iter_issue_timeline_until_comment") + def test_get_commit_sha_at_comment_force_push_before_comment( + self, mock_iter_timeline: Any, *args: Any + ) -> None: + 
mock_iter_timeline.return_value = [ + {"event": "committed", "sha": "commit1"}, + {"event": "committed", "sha": "commit2"}, + {"event": "head_ref_force_pushed", "commit_id": "commit3"}, + {"event": "commented", "id": 100}, + ] + pr = GitHubPR("pytorch", "pytorch", 77700) + sha = pr.get_commit_sha_at_comment(100) + self.assertEqual(sha, "commit3") + + @mock.patch("trymerge.iter_issue_timeline_until_comment") + def test_get_commit_sha_at_comment_force_push_before_comment_legacy_mode( + self, mock_iter_timeline: Any, *args: Any + ) -> None: + mock_iter_timeline.return_value = [ + {"event": "committed", "sha": "commit1"}, + {"event": "committed", "sha": "commit2"}, + {"event": "head_ref_force_pushed", "after": {"sha": "commit3"}}, + {"event": "commented", "id": 100}, + ] + pr = GitHubPR("pytorch", "pytorch", 77700) + sha = pr.get_commit_sha_at_comment(100) + self.assertEqual(sha, "commit3") + + @mock.patch("trymerge.iter_issue_timeline_until_comment") + def test_get_commit_sha_at_comment_multiple_comments( + self, mock_iter_timeline: Any, *args: Any + ) -> None: + mock_iter_timeline.return_value = [ + {"event": "committed", "sha": "commit1"}, + {"event": "commented", "id": 100}, + {"event": "committed", "sha": "commit2"}, + {"event": "commented", "id": 200}, + {"event": "head_ref_force_pushed", "after": {"sha": "commit3"}}, + {"event": "commented", "id": 300}, + ] + pr = GitHubPR("pytorch", "pytorch", 77700) + sha = pr.get_commit_sha_at_comment(200) + self.assertEqual(sha, "commit2") + sha = pr.get_commit_sha_at_comment(300) + self.assertEqual(sha, "commit3") + + @mock.patch("trymerge.iter_issue_timeline_until_comment") + def test_get_commit_sha_at_comment_no_events( + self, mock_iter_timeline: Any, *args: Any + ) -> None: + mock_iter_timeline.return_value = [ + {"event": "commented", "id": 100}, + {"event": "labeled", "label": {"name": "test"}}, + ] + pr = GitHubPR("pytorch", "pytorch", 77700) + sha = pr.get_commit_sha_at_comment(100) + self.assertIsNone(sha) + + @mock.patch("trymerge.iter_issue_timeline_until_comment") + def test_get_commit_sha_at_comment_exception( + self, mock_iter_timeline: Any, *args: Any + ) -> None: + mock_iter_timeline.side_effect = Exception("API error") + pr = GitHubPR("pytorch", "pytorch", 77700) + sha = pr.get_commit_sha_at_comment(100) + self.assertIsNone(sha) + + if __name__ == "__main__": main() diff --git a/.github/scripts/trymerge.py b/.github/scripts/trymerge.py index 695a53305a05..00b66869dcf2 100755 --- a/.github/scripts/trymerge.py +++ b/.github/scripts/trymerge.py @@ -450,6 +450,63 @@ def __init__(self, name: str, url: str, run_id: int, status: Optional[str]): IGNORABLE_FAILED_CHECKS_THESHOLD = 10 +def iter_issue_timeline_until_comment( + org: str, repo: str, issue_number: int, target_comment_id: int, max_pages: int = 200 +) -> Any: + """ + Yield timeline entries in order until (and including) the entry whose id == target_comment_id + for a 'commented' event. Stops once the target comment is encountered. 
+ """ + page = 1 + + while page <= max_pages: + url = ( + f"https://api.github.com/repos/{org}/{repo}/issues/{issue_number}/timeline" + ) + params = {"per_page": 100, "page": page} + + batch = gh_fetch_json_list(url, params) + + if not batch: + return + for ev in batch: + # The target is the issue comment row with event == "commented" and id == issue_comment_id + if ev.get("event") == "commented" and ev.get("id") == target_comment_id: + yield ev # nothing in the timeline after this matters, so stop early + return + yield ev + if len(batch) < 100: + return + page += 1 + + # If we got here without finding the comment, then we either hit a bug or some github PR + # has a _really_ long timeline. + # The max # of pages found on any pytorch/pytorch PR at the time of this change was 41 + raise RuntimeError( + f"Could not find a merge commit in the first {max_pages} pages of the timeline at url {url}." + f"This is most likely a bug, please report it to the @pytorch/pytorch-dev-infra team." + ) + + +def sha_from_committed_event(ev: dict[str, Any]) -> Optional[str]: + """Extract SHA from committed event in timeline""" + return ev.get("sha") + + +def sha_from_force_push_after(ev: dict[str, Any]) -> Optional[str]: + """Extract SHA from force push event in timeline""" + # The current GitHub API format + commit_id = ev.get("commit_id") + if commit_id: + return str(commit_id) + + # Legacy format + after = ev.get("after") or ev.get("after_commit") or {} + if isinstance(after, dict): + return after.get("sha") or after.get("oid") + return ev.get("after_sha") or ev.get("head_sha") + + def gh_get_pr_info(org: str, proj: str, pr_no: int) -> Any: rc = gh_graphql(GH_GET_PR_INFO_QUERY, name=proj, owner=org, number=pr_no) return rc["data"]["repository"]["pullRequest"] @@ -737,16 +794,24 @@ def get_changed_files_count(self) -> int: def last_commit(self) -> Any: return self.info["commits"]["nodes"][-1]["commit"] + def last_commit_sha(self, default: Optional[str] = None) -> str: + # for commits, the oid is the sha + + if default is None: + return str(self.last_commit()["oid"]) + + return str(self.last_commit().get("oid", default)) + def get_merge_base(self) -> str: if self.merge_base: return self.merge_base - last_commit_oid = self.last_commit()["oid"] + last_commit_sha = self.last_commit_sha() # NB: We could use self.base_ref() here for regular PR, however, that doesn't # work for ghstack where the base is the custom branch, i.e. gh/USER/ID/base, # so let's just use main instead self.merge_base = gh_fetch_merge_base( - self.org, self.project, last_commit_oid, self.default_branch() + self.org, self.project, last_commit_sha, self.default_branch() ) # Fallback to baseRefOid if the API call fails, i.e. rate limit. Note that baseRefOid @@ -835,6 +900,44 @@ def get_approved_by(self) -> list[str]: def get_commit_count(self) -> int: return int(self.info["commits_with_authors"]["totalCount"]) + def get_commit_sha_at_comment(self, comment_id: int) -> Optional[str]: + """ + Get the PR head commit SHA that was present when a specific comment was posted. + This ensures we only merge the state of the PR at the time the merge command was issued, + not any subsequent commits that may have been pushed after. + + Returns None if no head-changing events found before the comment or if the comment was not found. 
+ """ + head = None + + try: + for event in iter_issue_timeline_until_comment( + self.org, self.project, self.pr_num, comment_id + ): + etype = event.get("event") + if etype == "committed": + sha = sha_from_committed_event(event) + if sha: + head = sha + print(f"Timeline: Found commit event for SHA {sha}") + elif etype == "head_ref_force_pushed": + sha = sha_from_force_push_after(event) + if sha: + head = sha + print(f"Timeline: Found force push event for SHA {sha}") + elif etype == "commented": + if event.get("id") == comment_id: + print(f"Timeline: Found final comment with sha {sha}") + return head + except Exception as e: + print( + f"Warning: Failed to reconstruct timeline for comment {comment_id}: {e}" + ) + return None + + print(f"Did not find comment with id {comment_id} in the PR timeline") + return None + def get_pr_creator_login(self) -> str: return cast(str, self.info["author"]["login"]) @@ -1151,7 +1254,7 @@ def merge_into( *, skip_mandatory_checks: bool = False, dry_run: bool = False, - comment_id: Optional[int] = None, + comment_id: int, ignore_current_checks: Optional[list[str]] = None, ) -> None: # Raises exception if matching rule is not found @@ -1167,7 +1270,7 @@ def merge_into( skip_internal_checks=can_skip_internal_checks(self, comment_id), ignore_current_checks=ignore_current_checks, ) - additional_merged_prs = self.merge_changes( + additional_merged_prs = self.merge_changes_locally( repo, skip_mandatory_checks, comment_id ) @@ -1196,7 +1299,7 @@ def merge_into( broken_trunk_checks=ignorable_checks.get("BROKEN_TRUNK", []), flaky_checks=ignorable_checks.get("FLAKY", []), unstable_checks=ignorable_checks.get("UNSTABLE", []), - last_commit_sha=self.last_commit().get("oid", ""), + last_commit_sha=self.last_commit_sha(default=""), merge_base_sha=self.get_merge_base(), merge_commit_sha=merge_commit_sha, is_failed=False, @@ -1217,7 +1320,7 @@ def merge_into( dry_run=dry_run, ) - def merge_changes( + def merge_changes_locally( self, repo: GitRepo, skip_mandatory_checks: bool = False, @@ -1226,27 +1329,15 @@ def merge_changes( skip_all_rule_checks: bool = False, ) -> list["GitHubPR"]: """ - :param skip_all_rule_checks: If true, skips all rule checks, useful for dry-running merge locally + :param skip_all_rule_checks: If true, skips all rule checks on ghstack PRs, useful for dry-running merge locally """ branch_to_merge_into = self.default_branch() if branch is None else branch if repo.current_branch() != branch_to_merge_into: repo.checkout(branch_to_merge_into) - if not self.is_ghstack_pr(): - msg = self.gen_commit_message() - pr_branch_name = f"__pull-request-{self.pr_num}__init__" - repo.fetch(self.last_commit()["oid"], pr_branch_name) - repo._run_git("merge", "--squash", pr_branch_name) - repo._run_git("commit", f'--author="{self.get_author()}"', "-m", msg) - - # Did the PR change since we started the merge? - pulled_sha = repo.show_ref(pr_branch_name) - latest_pr_status = GitHubPR(self.org, self.project, self.pr_num) - if pulled_sha != latest_pr_status.last_commit()["oid"]: - raise RuntimeError( - "PR has been updated since CI checks last passed. Please rerun the merge command." - ) - return [] - else: + + # It's okay to skip the commit SHA check for ghstack PRs since + # authoring requires write access to the repo. 
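Editorial note: the timeline helpers above page through GitHub's issue timeline REST endpoint 100 events at a time and stop at the 'commented' event whose id matches the merge comment, which is how get_commit_sha_at_comment reconstructs the PR head at comment time. A standalone sketch of the same pagination loop using requests directly (token handling and error checking simplified; gh_fetch_json_list in trymerge.py wraps the same endpoint):

import os
import requests

def timeline_until_comment(org: str, repo: str, issue: int, target_comment_id: int):
    """Yield issue timeline events up to and including the target 'commented' event."""
    url = f"https://api.github.com/repos/{org}/{repo}/issues/{issue}/timeline"
    headers = {"Accept": "application/vnd.github+json"}
    if os.environ.get("GITHUB_TOKEN"):
        headers["Authorization"] = f"Bearer {os.environ['GITHUB_TOKEN']}"
    page = 1
    while True:
        batch = requests.get(
            url, headers=headers, params={"per_page": 100, "page": page}, timeout=30
        ).json()
        if not batch:
            return
        for ev in batch:
            yield ev
            if ev.get("event") == "commented" and ev.get("id") == target_comment_id:
                return  # nothing after the merge comment matters
        if len(batch) < 100:
            return
        page += 1

# e.g. events = list(timeline_until_comment("pytorch", "pytorch", 12345, 67890))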
+ if self.is_ghstack_pr(): return self.merge_ghstack_into( repo, skip_mandatory_checks, @@ -1254,6 +1345,48 @@ def merge_changes( skip_all_rule_checks=skip_all_rule_checks, ) + msg = self.gen_commit_message() + pr_branch_name = f"__pull-request-{self.pr_num}__init__" + + # Determine which commit SHA to merge + commit_to_merge = None + if not comment_id: + raise ValueError("Must provide --comment-id when merging regular PRs") + + # Get the commit SHA that was present when the comment was made + commit_to_merge = self.get_commit_sha_at_comment(comment_id) + if not commit_to_merge: + raise RuntimeError( + f"Could not find commit that was pushed before comment {comment_id}" + ) + + # Validate that this commit is the latest commit on the PR + latest_commit = self.last_commit_sha() + if commit_to_merge != latest_commit: + raise RuntimeError( + f"Commit {commit_to_merge} was HEAD when comment {comment_id} was posted " + f"but now the latest commit on the PR is {latest_commit}. " + f"Please re-issue the merge command to merge the latest commit." + ) + + print(f"Merging commit {commit_to_merge} locally") + + repo.fetch(commit_to_merge, pr_branch_name) + repo._run_git("merge", "--squash", pr_branch_name) + repo._run_git("commit", f'--author="{self.get_author()}"', "-m", msg) + + # Did the PR change since we started the merge? + pulled_sha = repo.show_ref(pr_branch_name) + latest_pr_status = GitHubPR(self.org, self.project, self.pr_num) + if ( + pulled_sha != latest_pr_status.last_commit_sha() + or pulled_sha != commit_to_merge + ): + raise RuntimeError( + "PR has been updated since CI checks last passed. Please rerun the merge command." + ) + return [] + class MergeRuleFailedError(RuntimeError): def __init__(self, message: str, rule: Optional["MergeRule"] = None) -> None: @@ -1458,7 +1591,7 @@ def find_matching_merge_rule( pending_checks = [] failed_checks = [] - hud_link = f"https://hud.pytorch.org/{pr.org}/{pr.project}/commit/{pr.last_commit()['oid']}" + hud_link = f"https://hud.pytorch.org/{pr.org}/{pr.project}/commit/{pr.last_commit_sha()}" if len(failed_checks) > 0: if reject_reason_score < 30000: reject_reason_score = 30000 @@ -2156,14 +2289,14 @@ def categorize_checks( def merge( pr: GitHubPR, repo: GitRepo, + comment_id: int, dry_run: bool = False, skip_mandatory_checks: bool = False, - comment_id: Optional[int] = None, timeout_minutes: int = 400, stale_pr_days: int = 3, ignore_current: bool = False, ) -> None: - initial_commit_sha = pr.last_commit()["oid"] + initial_commit_sha = pr.last_commit_sha() pr_link = f"https://github.com/{pr.org}/{pr.project}/pull/{pr.pr_num}" print(f"Attempting merge of {initial_commit_sha} ({pr_link})") @@ -2234,7 +2367,7 @@ def merge( f"Attempting merge of https://github.com/{pr.org}/{pr.project}/pull/{pr.pr_num} ({elapsed_time / 60} minutes elapsed)" ) pr = GitHubPR(pr.org, pr.project, pr.pr_num) - if initial_commit_sha != pr.last_commit()["oid"]: + if initial_commit_sha != pr.last_commit_sha(): raise RuntimeError( "New commits were pushed while merging. Please rerun the merge command." 
) @@ -2401,7 +2534,7 @@ def handle_exception(e: Exception, title: str = "Merge failed") -> None: if args.check_mergeability: if pr.is_ghstack_pr(): get_ghstack_prs(repo, pr) # raises error if out of sync - pr.merge_changes( + pr.merge_changes_locally( repo, skip_mandatory_checks=True, skip_all_rule_checks=True, @@ -2416,12 +2549,18 @@ def handle_exception(e: Exception, title: str = "Merge failed") -> None: gh_post_pr_comment(org, project, args.pr_num, message, dry_run=args.dry_run) return try: + # Ensure comment id is set, else fail + if not args.comment_id: + raise ValueError( + "Comment ID is required for merging PRs, please provide it using --comment-id" + ) + merge( pr, repo, + comment_id=args.comment_id, dry_run=args.dry_run, skip_mandatory_checks=args.force, - comment_id=args.comment_id, ignore_current=args.ignore_current, ) except Exception as e: @@ -2443,7 +2582,7 @@ def handle_exception(e: Exception, title: str = "Merge failed") -> None: broken_trunk_checks=[], flaky_checks=[], unstable_checks=[], - last_commit_sha=pr.last_commit().get("oid", ""), + last_commit_sha=pr.last_commit_sha(default=""), merge_base_sha=pr.get_merge_base(), is_failed=True, skip_mandatory_checks=args.force, diff --git a/.github/templates/common.yml.j2 b/.github/templates/common.yml.j2 index 23d4c003efa8..7c93fdf522a4 100644 --- a/.github/templates/common.yml.j2 +++ b/.github/templates/common.yml.j2 @@ -4,7 +4,7 @@ {%- set download_artifact_action = "actions/download-artifact@v4.1.7" -%} {%- set timeout_minutes = 240 -%} -{%- set timeout_minutes_windows_binary = 300 -%} +{%- set timeout_minutes_windows_binary = 360 -%} {%- macro concurrency(build_environment) -%} concurrency: @@ -32,7 +32,7 @@ concurrency: {%- macro setup_ec2_windows() -%} !{{ display_ec2_information() }} - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/templates/linux_binary_build_workflow.yml.j2 b/.github/templates/linux_binary_build_workflow.yml.j2 index e0998e46fb5f..bf7db5866e78 100644 --- a/.github/templates/linux_binary_build_workflow.yml.j2 +++ b/.github/templates/linux_binary_build_workflow.yml.j2 @@ -56,7 +56,7 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.9 with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -77,6 +77,9 @@ jobs: runs_on: linux.s390x ALPINE_IMAGE: "docker.io/s390x/alpine" timeout-minutes: 420 + {%- elif config["gpu_arch_type"] == "rocm" %} + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + timeout-minutes: 300 {%- elif "conda" in build_environment and config["gpu_arch_type"] == "cuda" %} runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runs_on: linux.24xlarge.ephemeral @@ -135,7 +138,7 @@ jobs: contents: read steps: - name: Setup XPU - uses: ./.github/actions/setup-xpu + uses: pytorch/pytorch/.github/actions/setup-xpu@release/2.9 - name: configure aws credentials id: aws_creds uses: aws-actions/configure-aws-credentials@v4 @@ -150,10 +153,10 @@ jobs: with: name: !{{ config["build_name"] }} path: "${{ runner.temp }}/artifacts/" - !{{ common.checkout(deep_clone=False, 
directory="pytorch") }} + !{{ common.checkout(deep_clone=False, directory="pytorch", checkout_pr_head=False) }} - name: Calculate docker image id: calculate-docker-image - uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.9 with: docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: !{{ config["container_image"] }} @@ -161,7 +164,7 @@ jobs: docker-build-dir: .ci/docker working-directory: pytorch - name: Pull Docker image - uses: pytorch/test-infra/.github/actions/pull-docker-image@main + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.9 with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Test Pytorch binary @@ -182,7 +185,7 @@ jobs: with: name: !{{ config["build_name"] }} path: "${{ runner.temp }}/artifacts/" - !{{ common.checkout(deep_clone=False, directory="pytorch") }} + !{{ common.checkout(deep_clone=False, directory="pytorch", checkout_pr_head=False) }} - name: ROCm set GPU_FLAG run: | echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}" @@ -196,7 +199,7 @@ jobs: role-duration-seconds: 18000 - name: Calculate docker image id: calculate-docker-image - uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.9 with: docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: !{{ config["container_image"] }} @@ -204,7 +207,7 @@ jobs: docker-build-dir: .ci/docker working-directory: pytorch - name: Pull Docker image - uses: pytorch/test-infra/.github/actions/pull-docker-image@main + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.9 with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Test Pytorch binary diff --git a/.github/templates/macos_binary_build_workflow.yml.j2 b/.github/templates/macos_binary_build_workflow.yml.j2 index 02fa68f54172..662060bb1307 100644 --- a/.github/templates/macos_binary_build_workflow.yml.j2 +++ b/.github/templates/macos_binary_build_workflow.yml.j2 @@ -68,12 +68,7 @@ jobs: chmod +x "${RUNNER_TEMP}/conda.sh" /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - if [ -d "/Applications/Xcode_14.3.1.app" ]; then - echo "DEVELOPER_DIR=/Applications/Xcode_14.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" - elif [ -d "/Applications/Xcode_13.3.1.app" ]; then - echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" - fi - !{{ common.checkout(deep_clone=False, directory="pytorch") }} + !{{ common.checkout(deep_clone=False, directory="pytorch", checkout_pr_head=False) }} - name: Populate binary env run: | # shellcheck disable=SC1091 diff --git a/.github/templates/upload.yml.j2 b/.github/templates/upload.yml.j2 index 1039a6214a7a..5e3798f8e237 100644 --- a/.github/templates/upload.yml.j2 +++ b/.github/templates/upload.yml.j2 @@ -33,7 +33,7 @@ {%- if is_windows %} # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" {%- endif %} {%- else %} diff --git 
a/.github/templates/windows_binary_build_workflow.yml.j2 b/.github/templates/windows_binary_build_workflow.yml.j2 index c3a824ad05a3..c61686f8df27 100644 --- a/.github/templates/windows_binary_build_workflow.yml.j2 +++ b/.github/templates/windows_binary_build_workflow.yml.j2 @@ -64,7 +64,7 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.9 with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -135,7 +135,7 @@ jobs: {%- else %} !{{ set_runner_specific_vars() }} !{{ common.setup_ec2_windows() }} - !{{ common.checkout(deep_clone=False, directory="pytorch") }} + !{{ common.checkout(deep_clone=False, directory="pytorch", checkout_pr_head=False) }} {%- endif %} - name: Populate binary env shell: bash @@ -211,7 +211,7 @@ jobs: "pytorch/.ci/pytorch/windows/arm64/bootstrap_rust.bat" {%- else %} !{{ common.setup_ec2_windows() }} - !{{ common.checkout(deep_clone=False, directory="pytorch") }} + !{{ common.checkout(deep_clone=False, directory="pytorch", checkout_pr_head=False) }} !{{ set_runner_specific_vars() }} {%- endif %} - uses: !{{ common.download_artifact_action }} diff --git a/.github/workflows/_bazel-build-test.yml b/.github/workflows/_bazel-build-test.yml index 72241a772be6..d9e5e29576d4 100644 --- a/.github/workflows/_bazel-build-test.yml +++ b/.github/workflows/_bazel-build-test.yml @@ -47,7 +47,7 @@ jobs: reenabled-issues: ${{ steps.filter.outputs.reenabled-issues }} steps: - name: Checkout PyTorch - uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.9 with: fetch-depth: 1 submodules: false @@ -69,25 +69,25 @@ jobs: runs-on: ${{ matrix.runner }} steps: - name: Setup SSH (Click me for login details) - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 with: github-secret: ${{ secrets.GITHUB_TOKEN }} # [see note: pytorch repo ref] - name: Checkout PyTorch - uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.9 - name: Setup Linux uses: ./.github/actions/setup-linux - name: Calculate docker image id: calculate-docker-image - uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.9 with: docker-image-name: ${{ inputs.docker-image-name }} - name: Pull docker image - uses: pytorch/test-infra/.github/actions/pull-docker-image@main + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.9 with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} @@ -97,7 +97,7 @@ jobs: run: echo "IN_CONTAINER_RUNNER=$(if [ -f /.inarc ] || [ -f /.incontainer ]; then echo true ; else echo false; fi)" >> "$GITHUB_OUTPUT" - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - uses: pytorch/test-infra/.github/actions/setup-nvidia@main + uses: pytorch/test-infra/.github/actions/setup-nvidia@release/2.9 if: ${{ inputs.cuda-version != 'cpu' && steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' }} - name: Output disk space left @@ -209,5 +209,5 @@ jobs: file-suffix: bazel-${{ github.job }}_${{ steps.get-job-id.outputs.job-id }} - name: Teardown Linux - uses: 
pytorch/test-infra/.github/actions/teardown-linux@main + uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.9 if: always() diff --git a/.github/workflows/_binary-build-linux.yml b/.github/workflows/_binary-build-linux.yml index bfa035bc753b..e81e4b6a8b26 100644 --- a/.github/workflows/_binary-build-linux.yml +++ b/.github/workflows/_binary-build-linux.yml @@ -142,13 +142,13 @@ jobs: - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" if: inputs.build_environment != 'linux-s390x-binary-manywheel' - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.github-token }} - name: Checkout PyTorch - uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.9 with: no-sudo: ${{ inputs.build_environment == 'linux-aarch64-binary-manywheel' || inputs.build_environment == 'linux-s390x-binary-manywheel' }} @@ -178,7 +178,6 @@ jobs: - name: Checkout PyTorch to pytorch dir uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -213,9 +212,9 @@ jobs: - name: Calculate docker image id: calculate-docker-image if: ${{ steps.filter.outputs.is-test-matrix-empty == 'False' && inputs.build_environment != 'linux-s390x-binary-manywheel' }} - uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.9 with: - # If doing this in main or release branch, use docker.io. Otherwise + # If doing this in a release branch, use docker.io. 
Otherwise # use ECR docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: ${{ inputs.DOCKER_IMAGE }} @@ -227,7 +226,7 @@ jobs: - name: Pull Docker image if: ${{ steps.filter.outputs.is-test-matrix-empty == 'False' && inputs.build_environment != 'linux-s390x-binary-manywheel' }} - uses: pytorch/test-infra/.github/actions/pull-docker-image@main + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.9 with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} @@ -283,7 +282,7 @@ jobs: - name: Teardown Linux if: always() && inputs.build_environment != 'linux-s390x-binary-manywheel' - uses: pytorch/test-infra/.github/actions/teardown-linux@main + uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.9 - name: Chown workspace if: always() && inputs.build_environment != 'linux-s390x-binary-manywheel' diff --git a/.github/workflows/_binary-test-linux.yml b/.github/workflows/_binary-test-linux.yml index 2d9e4d0e27b2..887ab908b2d8 100644 --- a/.github/workflows/_binary-test-linux.yml +++ b/.github/workflows/_binary-test-linux.yml @@ -125,14 +125,14 @@ jobs: - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" if: inputs.build_environment != 'linux-s390x-binary-manywheel' - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.github-token }} # Setup the environment - name: Checkout PyTorch - uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.9 with: no-sudo: ${{ inputs.build_environment == 'linux-aarch64-binary-manywheel' || inputs.build_environment == 'linux-s390x-binary-manywheel' }} @@ -155,7 +155,6 @@ jobs: - name: Checkout PyTorch to pytorch dir uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive show-progress: false path: pytorch @@ -186,9 +185,7 @@ jobs: path: "${{ runner.temp }}/artifacts/" - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - uses: pytorch/test-infra/.github/actions/setup-nvidia@main - with: - driver-version: ${{ startsWith(inputs.GPU_ARCH_VERSION, '13') && '580.65.06' || '570.133.07' }} + uses: pytorch/test-infra/.github/actions/setup-nvidia@release/2.9 if: ${{ inputs.GPU_ARCH_TYPE == 'cuda' && steps.filter.outputs.is-test-matrix-empty == 'False' }} - name: configure aws credentials @@ -203,7 +200,7 @@ jobs: - name: Calculate docker image id: calculate-docker-image if: ${{ steps.filter.outputs.is-test-matrix-empty == 'False' && inputs.build_environment != 'linux-s390x-binary-manywheel' }} - uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.9 with: docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: ${{ inputs.DOCKER_IMAGE }} @@ -213,7 +210,7 @@ jobs: - name: Pull Docker image if: ${{ steps.filter.outputs.is-test-matrix-empty == 'False' && inputs.build_environment != 'linux-s390x-binary-manywheel' }} - uses: pytorch/test-infra/.github/actions/pull-docker-image@main + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.9 with: docker-image: 
${{ steps.calculate-docker-image.outputs.docker-image }} @@ -225,7 +222,7 @@ jobs: - name: Teardown Linux if: always() && inputs.build_environment != 'linux-s390x-binary-manywheel' - uses: pytorch/test-infra/.github/actions/teardown-linux@main + uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.9 - name: Chown workspace if: always() && inputs.build_environment != 'linux-s390x-binary-manywheel' diff --git a/.github/workflows/_binary-upload.yml b/.github/workflows/_binary-upload.yml index 636b76d42931..61896f52bbed 100644 --- a/.github/workflows/_binary-upload.yml +++ b/.github/workflows/_binary-upload.yml @@ -81,7 +81,7 @@ jobs: SHA1: ${{ github.event.pull_request.head.sha || github.sha }} steps: - name: Checkout PyTorch - uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.9 with: no-sudo: true diff --git a/.github/workflows/_docs.yml b/.github/workflows/_docs.yml index ff5dbe604bac..5980ad849fa7 100644 --- a/.github/workflows/_docs.yml +++ b/.github/workflows/_docs.yml @@ -67,7 +67,7 @@ jobs: # an OOM issue when running the job, so this upgrades the runner from 4xlarge # to the next available tier of 12xlarge. So much memory just to generate cpp # doc - runner: ${{ inputs.runner_prefix }}linux.12xlarge + runner: ${{ inputs.runner_prefix }}linux.12xlarge.memory # TODO: Nightly cpp docs take longer and longer to finish (more than 3h now) # Let's try to figure out how this can be improved timeout-minutes: 360 @@ -84,7 +84,7 @@ jobs: name: build-docs-${{ matrix.docs_type }}-${{ inputs.push }} steps: - name: Setup SSH (Click me for login details) - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 with: github-secret: ${{ secrets.GITHUB_TOKEN }} instructions: | @@ -95,7 +95,7 @@ jobs: # [see note: pytorch repo ref] - name: Checkout PyTorch - uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.9 - name: Setup Linux uses: ./.github/actions/setup-linux @@ -110,12 +110,12 @@ jobs: - name: Calculate docker image id: calculate-docker-image - uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.9 with: docker-image-name: ${{ inputs.docker-image }} - name: Pull docker image - uses: pytorch/test-infra/.github/actions/pull-docker-image@main + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.9 with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} @@ -222,5 +222,5 @@ jobs: s3-prefix: pytorch/pytorch/${{ github.event.pull_request.number }}/functorchdocs - name: Teardown Linux - uses: pytorch/test-infra/.github/actions/teardown-linux@main + uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.9 if: always() diff --git a/.github/workflows/_link_check.yml b/.github/workflows/_link_check.yml index 014e6106b073..4c46ad28cf6b 100644 --- a/.github/workflows/_link_check.yml +++ b/.github/workflows/_link_check.yml @@ -11,7 +11,7 @@ on: jobs: lint-urls: if: ${{ github.event_name != 'pull_request' || !contains(github.event.pull_request.labels.*.name, 'skip-url-lint') }} - uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@release/2.9 with: job-name: lint-urls timeout: 120 @@ -37,7 +37,7 @@ jobs: lint-xrefs: if: ${{ github.event_name != 'pull_request' || 
!contains(github.event.pull_request.labels.*.name, 'skip-xref-lint') }} - uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@release/2.9 with: job-name: lint-xrefs timeout: 60 diff --git a/.github/workflows/_linux-build.yml b/.github/workflows/_linux-build.yml index 6b4bd429e3c9..f909488850d0 100644 --- a/.github/workflows/_linux-build.yml +++ b/.github/workflows/_linux-build.yml @@ -134,7 +134,7 @@ jobs: test-matrix: ${{ steps.filter.outputs.test-matrix }} steps: - name: Setup SSH (Click me for login details) - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 if: inputs.build-environment != 'linux-s390x-binary-manywheel' with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -147,7 +147,7 @@ jobs: # checkout because when we run this action we don't *have* a local # checkout. In other cases you should prefer a local checkout. - name: Checkout PyTorch - uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.9 with: no-sudo: true @@ -183,7 +183,7 @@ jobs: - name: Calculate docker image id: calculate-docker-image - uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.9 if: inputs.build-environment != 'linux-s390x-binary-manywheel' with: docker-image-name: ${{ inputs.docker-image-name }} @@ -199,7 +199,7 @@ jobs: echo "docker pull ghcr.io/pytorch/ci-image:${tag/:/-}" - name: Pull docker image - uses: pytorch/test-infra/.github/actions/pull-docker-image@main + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.9 if: inputs.build-environment != 'linux-s390x-binary-manywheel' && steps.use-old-whl.outputs.reuse != 'true' with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} @@ -457,7 +457,7 @@ jobs: artifact_prefix: usage_log_build_${{ steps.get-job-id.outputs.job-id }} - name: Teardown Linux - uses: pytorch/test-infra/.github/actions/teardown-linux@main + uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.9 if: always() && inputs.build-environment != 'linux-s390x-binary-manywheel' - name: Cleanup docker diff --git a/.github/workflows/_linux-test.yml b/.github/workflows/_linux-test.yml index 66579b573a63..f413f497d79e 100644 --- a/.github/workflows/_linux-test.yml +++ b/.github/workflows/_linux-test.yml @@ -99,7 +99,7 @@ jobs: contents: read steps: - name: Setup SSH (Click me for login details) - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 if: ${{ !contains(matrix.runner, 'b200') && inputs.build-environment != 'linux-s390x-binary-manywheel' }} with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -108,7 +108,7 @@ jobs: docker exec -it $(docker container ps --format '{{.ID}}') bash - name: Checkout PyTorch - uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.9 with: no-sudo: true @@ -139,7 +139,7 @@ jobs: - name: Calculate docker image id: calculate-docker-image - uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.9 if: inputs.build-environment != 'linux-s390x-binary-manywheel' with: docker-image-name: ${{ inputs.docker-image }} @@ -155,7 +155,7 @@ jobs: echo "docker pull 
ghcr.io/pytorch/ci-image:${tag/:/-}" - name: Pull docker image - uses: pytorch/test-infra/.github/actions/pull-docker-image@main + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.9 if: inputs.build-environment != 'linux-s390x-binary-manywheel' with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} @@ -167,9 +167,9 @@ jobs: - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG id: install-nvidia-driver - uses: pytorch/test-infra/.github/actions/setup-nvidia@main + uses: pytorch/test-infra/.github/actions/setup-nvidia@release/2.9 with: - driver-version: ${{ matrix.config == 'legacy_nvidia_driver' && '525.105.17' || '570.133.07' }} + driver-version: ${{ matrix.config == 'legacy_nvidia_driver' && '525.105.17' || '580.82.07' }} if: ${{ contains(inputs.build-environment, 'cuda') && !contains(matrix.config, 'nogpu') && steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' && !contains(matrix.runner, 'b200') }} - name: Setup GPU_FLAG for docker run @@ -273,6 +273,8 @@ jobs: TEST_CONFIG: ${{ matrix.config }} SHARD_NUMBER: ${{ matrix.shard }} NUM_TEST_SHARDS: ${{ matrix.num_shards }} + EXTRA_FLAGS: ${{ matrix.extra_flags || '' }} + OP_BENCHMARK_TESTS: ${{ matrix.op_benchmark_tests }} REENABLED_ISSUES: ${{ steps.keep-going.outputs.reenabled-issues }} CONTINUE_THROUGH_ERROR: ${{ steps.keep-going.outputs.keep-going }} VERBOSE_TEST_LOGS: ${{ steps.keep-going.outputs.ci-verbose-test-logs }} @@ -418,7 +420,7 @@ jobs: aws-region: us-east-1 - name: Upload the benchmark results - uses: pytorch/test-infra/.github/actions/upload-benchmark-results@main + uses: pytorch/test-infra/.github/actions/upload-benchmark-results@release/2.9 if: inputs.build-environment != 'linux-s390x-binary-manywheel' with: benchmark-results-dir: test/test-reports @@ -476,7 +478,7 @@ jobs: workflow_attempt: ${{github.run_attempt}} - name: Teardown Linux - uses: pytorch/test-infra/.github/actions/teardown-linux@main + uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.9 if: always() && steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' # NB: We are currently having an intermittent GPU-related issue on G5 runners with diff --git a/.github/workflows/_mac-build.yml b/.github/workflows/_mac-build.yml index a2a5f8dd9111..9561dcc8b895 100644 --- a/.github/workflows/_mac-build.yml +++ b/.github/workflows/_mac-build.yml @@ -67,11 +67,11 @@ jobs: test-matrix: ${{ steps.filter.outputs.test-matrix }} steps: - name: Clean up disk space before running MacOS workflow - uses: pytorch/test-infra/.github/actions/check-disk-space@main + uses: pytorch/test-infra/.github/actions/check-disk-space@release/2.9 # [see note: pytorch repo ref] - name: Checkout PyTorch - uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.9 - name: Set xcode version env: @@ -82,7 +82,7 @@ jobs: fi - name: Setup Python - uses: pytorch/test-infra/.github/actions/setup-python@main + uses: pytorch/test-infra/.github/actions/setup-python@release/2.9 with: python-version: ${{ inputs.python-version }} pip-requirements-file: .github/requirements/pip-requirements-macOS.txt @@ -188,4 +188,4 @@ jobs: - name: Clean up disk space if: always() continue-on-error: true - uses: pytorch/test-infra/.github/actions/check-disk-space@main + uses: pytorch/test-infra/.github/actions/check-disk-space@release/2.9 diff --git a/.github/workflows/_mac-test.yml b/.github/workflows/_mac-test.yml index 086e25b4868e..29ff3a72817f 100644 
--- a/.github/workflows/_mac-test.yml +++ b/.github/workflows/_mac-test.yml @@ -105,11 +105,11 @@ jobs: done - name: Clean up disk space before running MacOS workflow - uses: pytorch/test-infra/.github/actions/check-disk-space@main + uses: pytorch/test-infra/.github/actions/check-disk-space@release/2.9 # [see note: pytorch repo ref] - name: Checkout PyTorch - uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.9 - name: Get workflow job id id: get-job-id @@ -119,7 +119,7 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} - name: Setup Python - uses: pytorch/test-infra/.github/actions/setup-python@main + uses: pytorch/test-infra/.github/actions/setup-python@release/2.9 with: python-version: ${{ inputs.python-version }} pip-requirements-file: .github/requirements/pip-requirements-macOS.txt @@ -257,7 +257,7 @@ jobs: file-suffix: ${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}_${{ steps.get-job-id.outputs.job-id }} - name: Upload the benchmark results - uses: pytorch/test-infra/.github/actions/upload-benchmark-results@main + uses: pytorch/test-infra/.github/actions/upload-benchmark-results@release/2.9 with: benchmark-results-dir: test/test-reports dry-run: false @@ -287,4 +287,4 @@ jobs: - name: Clean up disk space if: always() continue-on-error: true - uses: pytorch/test-infra/.github/actions/check-disk-space@main + uses: pytorch/test-infra/.github/actions/check-disk-space@release/2.9 diff --git a/.github/workflows/_rocm-test.yml b/.github/workflows/_rocm-test.yml index f73972942b5f..b6cd5d88a094 100644 --- a/.github/workflows/_rocm-test.yml +++ b/.github/workflows/_rocm-test.yml @@ -81,7 +81,7 @@ jobs: steps: # [see note: pytorch repo ref] - name: Checkout PyTorch - uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.9 with: no-sudo: true @@ -113,12 +113,12 @@ jobs: - name: Calculate docker image id: calculate-docker-image - uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.9 with: docker-image-name: ${{ inputs.docker-image }} - name: Pull docker image - uses: pytorch/test-infra/.github/actions/pull-docker-image@main + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.9 with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} @@ -330,7 +330,7 @@ jobs: aws-region: us-east-1 - name: Upload the benchmark results - uses: pytorch/test-infra/.github/actions/upload-benchmark-results@main + uses: pytorch/test-infra/.github/actions/upload-benchmark-results@release/2.9 with: benchmark-results-dir: test/test-reports dry-run: false diff --git a/.github/workflows/_runner-determinator.yml b/.github/workflows/_runner-determinator.yml index 0d674f044ec4..dd28024dbd80 100644 --- a/.github/workflows/_runner-determinator.yml +++ b/.github/workflows/_runner-determinator.yml @@ -59,7 +59,7 @@ jobs: PR_NUMBER: ${{ github.event.pull_request.number }} steps: # - name: Checkout PyTorch - # uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + # uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.9 # with: # fetch-depth: 1 # submodules: true diff --git a/.github/workflows/_win-build.yml b/.github/workflows/_win-build.yml index ebfb4001e437..92543128265d 100644 --- a/.github/workflows/_win-build.yml +++ b/.github/workflows/_win-build.yml @@ -77,6 +77,7 @@ jobs: 
run: | git config --global core.longpaths true git config --global core.symlinks true + git config --global core.ignorecase false # https://git-scm.com/docs/git-fsmonitor--daemon. The daemon could lock # the directory on Windows and prevent GHA from checking out as reported @@ -84,10 +85,10 @@ jobs: git config --global core.fsmonitor false - name: Clean up leftover processes on non-ephemeral Windows runner - uses: pytorch/test-infra/.github/actions/cleanup-runner@main + uses: pytorch/test-infra/.github/actions/cleanup-runner@release/2.9 - name: Setup SSH (Click me for login details) - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 with: github-secret: ${{ secrets.GITHUB_TOKEN }} instructions: | @@ -102,7 +103,7 @@ jobs: # [see note: pytorch repo ref] - name: Checkout PyTorch - uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.9 with: no-sudo: true @@ -150,7 +151,7 @@ jobs: BUILD_WHEEL: 1 MAX_JOBS: 8 CUDA_VERSION: ${{ inputs.cuda-version }} - PYTHON_VERSION: "3.9" + PYTHON_VERSION: "3.10" SCCACHE_BUCKET: "ossci-compiler-cache" SCCACHE_S3_KEY_PREFIX: ${{ github.workflow }} SCCACHE_REGION: us-east-1 diff --git a/.github/workflows/_win-test.yml b/.github/workflows/_win-test.yml index 0c95503928fb..37e48d99e2be 100644 --- a/.github/workflows/_win-test.yml +++ b/.github/workflows/_win-test.yml @@ -70,6 +70,7 @@ jobs: run: | git config --global core.longpaths true git config --global core.symlinks true + git config --global core.ignorecase false # https://git-scm.com/docs/git-fsmonitor--daemon. The daemon could lock # the directory on Windows and prevent GHA from checking out as reported @@ -77,10 +78,10 @@ jobs: git config --global core.fsmonitor false - name: Clean up leftover processes on non-ephemeral Windows runner - uses: pytorch/test-infra/.github/actions/cleanup-runner@main + uses: pytorch/test-infra/.github/actions/cleanup-runner@release/2.9 - name: Setup SSH (Click me for login details) - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 with: github-secret: ${{ secrets.GITHUB_TOKEN }} instructions: | @@ -96,7 +97,7 @@ jobs: # [see note: pytorch repo ref] - name: Checkout PyTorch - uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.9 with: no-sudo: true @@ -183,7 +184,7 @@ jobs: env: USE_CUDA: ${{ inputs.cuda-version != 'cpu' && '1' || '0' }} INSTALL_WINDOWS_SDK: 1 - PYTHON_VERSION: 3.9 + PYTHON_VERSION: "3.10" CONTINUE_THROUGH_ERROR: ${{ steps.keep-going.outputs.keep-going }} VERBOSE_TEST_LOGS: ${{ steps.keep-going.outputs.ci-verbose-test-logs }} TEST_SHOWLOCALS: ${{ steps.keep-going.outputs.ci-test-showlocals }} diff --git a/.github/workflows/_xpu-test.yml b/.github/workflows/_xpu-test.yml index 177e6ca4bbe3..6bceb4eef6ba 100644 --- a/.github/workflows/_xpu-test.yml +++ b/.github/workflows/_xpu-test.yml @@ -77,7 +77,7 @@ jobs: steps: # [see note: pytorch repo ref] - name: Checkout PyTorch - uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.9 - name: Setup XPU uses: ./.github/actions/setup-xpu @@ -95,7 +95,7 @@ jobs: - name: Calculate docker image id: calculate-docker-image - uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + uses: 
pytorch/test-infra/.github/actions/calculate-docker-image@release/2.9 with: docker-image-name: ${{ inputs.docker-image }} @@ -109,7 +109,7 @@ jobs: echo "docker pull ghcr.io/pytorch/ci-image:${tag/:/-}" - name: Pull docker image - uses: pytorch/test-infra/.github/actions/pull-docker-image@main + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.9 with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} @@ -275,7 +275,7 @@ jobs: - name: Change permissions if: ${{ always() && steps.test.conclusion }} run: | - docker exec -t "${{ env.CONTAINER_NAME }}" sh -c "sudo chown -R 1001:1001 test" + docker exec -t "${{ env.CONTAINER_NAME }}" sh -c "sudo chown -R 1000:1000 test" - name: Print remaining test logs shell: bash diff --git a/.github/workflows/build-almalinux-images.yml b/.github/workflows/build-almalinux-images.yml index 0754b154a358..e0492f736442 100644 --- a/.github/workflows/build-almalinux-images.yml +++ b/.github/workflows/build-almalinux-images.yml @@ -39,7 +39,7 @@ jobs: tag: ["cuda12.6", "cuda12.8", "cuda12.9", "cuda13.0", "rocm6.3", "rocm6.4", "cpu"] steps: - name: Build docker image - uses: pytorch/pytorch/.github/actions/binary-docker-build@main + uses: pytorch/pytorch/.github/actions/binary-docker-build@release/2.9 with: docker-image-name: almalinux-builder custom-tag-prefix: ${{matrix.tag}} diff --git a/.github/workflows/build-libtorch-images.yml b/.github/workflows/build-libtorch-images.yml index cc2f54fc45f8..edfa0168e19f 100644 --- a/.github/workflows/build-libtorch-images.yml +++ b/.github/workflows/build-libtorch-images.yml @@ -32,7 +32,7 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.9 with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -58,7 +58,7 @@ jobs: ] steps: - name: Build docker image - uses: pytorch/pytorch/.github/actions/binary-docker-build@main + uses: pytorch/pytorch/.github/actions/binary-docker-build@release/2.9 with: docker-image-name: libtorch-cxx11-builder custom-tag-prefix: ${{ matrix.tag }} diff --git a/.github/workflows/build-manywheel-images-s390x.yml b/.github/workflows/build-manywheel-images-s390x.yml index c498e169f1aa..a719bf21a1ca 100644 --- a/.github/workflows/build-manywheel-images-s390x.yml +++ b/.github/workflows/build-manywheel-images-s390x.yml @@ -25,7 +25,7 @@ jobs: runs-on: linux.s390x steps: - name: Checkout PyTorch - uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.9 with: submodules: false no-sudo: true diff --git a/.github/workflows/build-manywheel-images.yml b/.github/workflows/build-manywheel-images.yml index ce42d5644c93..e3549cd6284a 100644 --- a/.github/workflows/build-manywheel-images.yml +++ b/.github/workflows/build-manywheel-images.yml @@ -32,7 +32,7 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.9 with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -47,12 +47,11 @@ jobs: matrix: include: [ { name: "manylinux2_28-builder", tag: "cuda13.0", runner: 
"linux.9xlarge.ephemeral" }, - { name: "manylinux2_28-builder", tag: "cuda12.9", runner: "linux.9xlarge.ephemeral" }, { name: "manylinux2_28-builder", tag: "cuda12.8", runner: "linux.9xlarge.ephemeral" }, { name: "manylinux2_28-builder", tag: "cuda12.6", runner: "linux.9xlarge.ephemeral" }, { name: "manylinuxaarch64-builder", tag: "cuda13.0", runner: "linux.arm64.2xlarge.ephemeral" }, - { name: "manylinuxaarch64-builder", tag: "cuda12.9", runner: "linux.arm64.2xlarge.ephemeral" }, { name: "manylinuxaarch64-builder", tag: "cuda12.8", runner: "linux.arm64.2xlarge.ephemeral" }, + { name: "manylinuxaarch64-builder", tag: "cuda12.6", runner: "linux.arm64.2xlarge.ephemeral" }, { name: "manylinux2_28-builder", tag: "rocm6.3", runner: "linux.9xlarge.ephemeral" }, { name: "manylinux2_28-builder", tag: "rocm6.4", runner: "linux.9xlarge.ephemeral" }, { name: "manylinux2_28-builder", tag: "cpu", runner: "linux.9xlarge.ephemeral" }, @@ -64,7 +63,7 @@ jobs: name: ${{ matrix.name }}:${{ matrix.tag }} steps: - name: Build docker image - uses: pytorch/pytorch/.github/actions/binary-docker-build@main + uses: pytorch/pytorch/.github/actions/binary-docker-build@release/2.9 with: docker-image-name: ${{ matrix.name }} custom-tag-prefix: ${{ matrix.tag }} diff --git a/.github/workflows/build-triton-wheel.yml b/.github/workflows/build-triton-wheel.yml index 932d9c886302..8f066de47534 100644 --- a/.github/workflows/build-triton-wheel.yml +++ b/.github/workflows/build-triton-wheel.yml @@ -3,7 +3,7 @@ name: Build Triton wheels on: push: branches: - - main + - release/2.9 tags: # NOTE: Binary build pipelines should only get triggered on release candidate builds # Release candidate tags look like: v1.11.0-rc1 @@ -36,7 +36,7 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.9 with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -74,12 +74,12 @@ jobs: PLATFORM: 'manylinux_2_28_x86_64' steps: - name: Setup SSH (Click me for login details) - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 with: github-secret: ${{ secrets.GITHUB_TOKEN }} - name: Checkout PyTorch - uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.9 with: submodules: false @@ -87,7 +87,7 @@ jobs: uses: ./.github/actions/setup-linux - name: Pull Docker image - uses: pytorch/test-infra/.github/actions/pull-docker-image@main + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.9 with: docker-image: ${{ env.DOCKER_IMAGE }} @@ -184,7 +184,7 @@ jobs: path: ${{ runner.temp }}/artifacts/wheelhouse/* - name: Teardown Linux - uses: pytorch/test-infra/.github/actions/teardown-linux@main + uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.9 if: always() build-wheel-win: @@ -217,7 +217,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/build-vllm-wheel.yml 
b/.github/workflows/build-vllm-wheel.yml new file mode 100644 index 000000000000..9efedf64cce7 --- /dev/null +++ b/.github/workflows/build-vllm-wheel.yml @@ -0,0 +1,248 @@ +name: Build vLLM wheels + +on: + push: + branches: + - main + paths: + - .github/workflows/build-vllm-wheel.yml + - .github/ci_commit_pins/vllm.txt + workflow_dispatch: + pull_request: + paths: + - .github/workflows/build-vllm-wheel.yml + - .github/ci_commit_pins/vllm.txt + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} + cancel-in-progress: true + +jobs: + build-wheel: + if: github.repository_owner == 'pytorch' + strategy: + fail-fast: false + matrix: + python-version: [ '3.12' ] + # TODO (huydhn): Add cu130 https://github.com/pytorch/pytorch/pull/162000#issuecomment-3261541554 + device: [ 'cu128', 'cu129' ] + runner: [ 'linux.12xlarge.memory' ] + include: + - device: cu128 + manylinux-image: 'pytorch/manylinux2_28-builder:cuda12.8' + - device: cu129 + manylinux-image: 'pytorch/manylinux2_28-builder:cuda12.9' + name: "Build ${{ matrix.device }} vLLM wheel" + runs-on: ${{ matrix.runner }} + timeout-minutes: 480 + env: + PY_VERS: ${{ matrix.python-version }} + MANYLINUX_IMAGE: ${{ matrix.manylinux-image }} + PLATFORM: 'manylinux_2_28_x86_64' + BUILD_DEVICE: ${{ matrix.device }} + steps: + - name: Setup SSH (Click me for login details) + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.9 + with: + submodules: false + + - name: Setup Linux + uses: ./.github/actions/setup-linux + + - name: Get latest PyTorch nightly + shell: bash + run: | + set -eux + + # Keep PyTorch nightly wheel here so that we can install it later during + # vLLM build process + mkdir -p "${RUNNER_TEMP}/artifacts/" + + container_name=$(docker run \ + --tty \ + --detach \ + -e PLATFORM \ + -v "${GITHUB_WORKSPACE}:/pytorch" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w /artifacts/ \ + "${MANYLINUX_IMAGE}" + ) + + # Determine python executable for given version (copied from build-triton-wheel) + case $PY_VERS in + 3.10) + PYTHON_EXECUTABLE=/opt/python/cp310-cp310/bin/python + ;; + 3.11) + PYTHON_EXECUTABLE=/opt/python/cp311-cp311/bin/python + ;; + 3.12) + PYTHON_EXECUTABLE=/opt/python/cp312-cp312/bin/python + ;; + 3.13) + PYTHON_EXECUTABLE=/opt/python/cp313-cp313/bin/python + ;; + 3.13t) + PYTHON_EXECUTABLE=/opt/python/cp313-cp313t/bin/python + ;; + 3.14) + PYTHON_EXECUTABLE=/opt/python/cp314-cp314/bin/python + ;; + 3.14t) + PYTHON_EXECUTABLE=/opt/python/cp314-cp314t/bin/python + ;; + *) + echo "Unsupported python version ${PY_VERS}" + exit 1 + ;; + esac + + docker exec -t "${container_name}" "${PYTHON_EXECUTABLE}" -mpip install \ + --pre torch torchvision torchaudio \ + --index-url "https://download.pytorch.org/whl/nightly/${BUILD_DEVICE}" + + # I wonder if there is a command to both download and install the wheels + # in one go + docker exec -t "${container_name}" "${PYTHON_EXECUTABLE}" -mpip download \ + --pre torch torchvision torchaudio \ + --index-url "https://download.pytorch.org/whl/nightly/${BUILD_DEVICE}" + + # Save this for later + echo "PYTHON_EXECUTABLE=${PYTHON_EXECUTABLE}" >> "$GITHUB_ENV" + echo "container_name=${container_name}" >> "$GITHUB_ENV" + + - name: Build vLLM wheel + uses: ./.github/actions/build-external-packages + with: + build-targets: vllm + docker-image: ${{ 
env.MANYLINUX_IMAGE }} + cuda-arch-list: '8.0;8.9;9.0;10.0;12.0' + torch-wheel-dir: ${{ runner.temp }}/artifacts + output-dir: ${{ runner.temp }}/artifacts/externals + + - name: Prepare vLLM wheel + shell: bash + run: | + set -eux + + # Get these wheels ready, the vllm renaming logic is copied from its .buildkite/scripts/upload-wheels.sh + docker exec -t "${container_name}" bash -c " + set -eux + + nightly=\$(unzip -p torch-* '**/METADATA' | grep '^Version: ' | cut -d' ' -f2 | cut -d'.' -f4) + + pushd externals/vllm/wheels + for package in xformers flashinfer-python vllm; do + pushd \$package + auditwheel repair --plat \$PLATFORM *.whl \ + --exclude libc10* --exclude libtorch* --exclude libcu* --exclude libnv* + repair_wheel=\$(find wheelhouse -name *\${PLATFORM}*) + repair_wheel=\$(basename \${repair_wheel}) + popd + + cp \${package}/wheelhouse/\${repair_wheel} . + version=\$(unzip -p \$repair_wheel '**/METADATA' | grep '^Version: ' | cut -d' ' -f2) + + if [[ \$package == vllm ]]; then + new_wheel=\${repair_wheel/\$version/1.0.0.\$nightly} + else + major_version=\$(echo \$version | tr '.+' '.' | cut -d'.' -f1-3) + new_wheel=\${repair_wheel/\$version/\$major_version.\$nightly} + fi + + mv -- \$repair_wheel \$new_wheel + rm -rf \$package + done + popd + " + + docker exec -t "${container_name}" chown -R 1000:1000 /artifacts + + - uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 + with: + name: vllm-wheel-${{ matrix.device }}-${{ matrix.python-version }}-${{ env.PLATFORM }} + if-no-files-found: error + path: ${{ runner.temp }}/artifacts/externals/vllm/wheels/*.whl + + - name: Teardown Linux + uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.9 + if: always() + + # Copied from build-triton-wheel workflow (mostly) + upload-wheel: + name: "Upload ${{ matrix.device }} vLLM wheel" + needs: + - build-wheel + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + device: [ 'cu128', 'cu129' ] + env: + BUILD_DEVICE: ${{ matrix.device }} + permissions: + id-token: write + contents: read + container: + image: continuumio/miniconda3:4.12.0 + environment: ${{ (github.event_name == 'push' && github.event.ref == 'refs/heads/main') && 'nightly-wheel-upload' || '' }} + steps: + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + + - name: Configure AWS credentials(PyTorch account) for main + if: ${{ github.event_name == 'push' && github.event.ref == 'refs/heads/main' }} + uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0 + with: + role-to-assume: arn:aws:iam::749337293305:role/gha_workflow_nightly_build_wheels + aws-region: us-east-1 + + - name: Configure AWS credentials(PyTorch account) for RC builds + if: ${{ github.event_name == 'push' && (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/')) }} + uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0 + with: + role-to-assume: arn:aws:iam::749337293305:role/gha_workflow_test_build_wheels + aws-region: us-east-1 + + - name: Download Build Artifacts + uses: actions/download-artifact@65a9edc5881444af0b9093a5e628f2fe47ea3b2e # v4.1.7 + with: + # Download all available artifacts + path: ${{ runner.temp }}/artifacts-all + + - name: Select Wheel Artifacts + shell: bash + run: | + set -eux + mkdir -p "${RUNNER_TEMP}/artifacts/" + mv "${RUNNER_TEMP}"/artifacts-all/vllm-wheel-"${BUILD_DEVICE}"-*/* "${RUNNER_TEMP}/artifacts/" + + - name: Set DRY_RUN (only for 
tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) }} + shell: bash + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/v') }} + shell: bash + run: | + set -ex + + if [[ "${GITHUB_REF_NAME}" = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + + - name: Upload binaries + env: + PACKAGE_TYPE: wheel + UPLOAD_SUBFOLDER: ${{ env.BUILD_DEVICE }} + PKG_DIR: ${{ runner.temp }}/artifacts + shell: bash + run: | + set -ex + bash .circleci/scripts/binary_upload.sh diff --git a/.github/workflows/check-labels.yml b/.github/workflows/check-labels.yml index 44430522b79d..1174a1c502f6 100644 --- a/.github/workflows/check-labels.yml +++ b/.github/workflows/check-labels.yml @@ -38,7 +38,7 @@ jobs: runs-on: linux.24_04.4x steps: - name: Checkout PyTorch - uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.9 with: submodules: false fetch-depth: 1 diff --git a/.github/workflows/close-nonexistent-disable-issues.yml b/.github/workflows/close-nonexistent-disable-issues.yml index bef3d8797149..da83019a5908 100644 --- a/.github/workflows/close-nonexistent-disable-issues.yml +++ b/.github/workflows/close-nonexistent-disable-issues.yml @@ -13,7 +13,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout PyTorch - uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.9 with: submodules: false fetch-depth: 1 diff --git a/.github/workflows/create_release.yml b/.github/workflows/create_release.yml index 57fe7be15d29..03631be3e563 100644 --- a/.github/workflows/create_release.yml +++ b/.github/workflows/create_release.yml @@ -19,7 +19,7 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.9 with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} diff --git a/.github/workflows/docker-builds.yml b/.github/workflows/docker-builds.yml index b86ee2352bd1..f88244a13ffc 100644 --- a/.github/workflows/docker-builds.yml +++ b/.github/workflows/docker-builds.yml @@ -33,7 +33,7 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.9 with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -50,28 +50,27 @@ jobs: runner: [linux.12xlarge] docker-image-name: [ pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11, + pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc11, pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc11-vllm, pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks, - pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc9-inductor-benchmarks, - pytorch-linux-jammy-cuda12.8-cudnn9-py3.13-gcc9-inductor-benchmarks, pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9, pytorch-linux-jammy-cuda12.4-cudnn9-py3-gcc11, - pytorch-linux-jammy-py3.9-clang12, + pytorch-linux-jammy-py3.10-clang12, 
pytorch-linux-jammy-py3.13-clang12, pytorch-linux-jammy-rocm-n-py3, pytorch-linux-noble-rocm-n-py3, pytorch-linux-noble-rocm-alpha-py3, pytorch-linux-jammy-rocm-n-py3-benchmarks, - pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-clang12, - pytorch-linux-jammy-py3.9-gcc11, - pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks, + pytorch-linux-jammy-cuda12.8-cudnn9-py3.10-clang12, + pytorch-linux-jammy-py3.10-gcc11, + pytorch-linux-jammy-py3-gcc11-inductor-benchmarks, pytorch-linux-jammy-py3.12-halide, - pytorch-linux-jammy-xpu-2025.0-py3, - pytorch-linux-jammy-xpu-2025.1-py3, + pytorch-linux-jammy-xpu-n-1-py3, + pytorch-linux-jammy-xpu-n-py3, pytorch-linux-jammy-py3-clang18-asan, pytorch-linux-jammy-py3-clang12-onnx, pytorch-linux-jammy-linter, - pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-linter, + pytorch-linux-jammy-cuda12.8-cudnn9-py3.10-linter, # Executorch pin needs update # pytorch-linux-jammy-py3-clang12-executorch, pytorch-linux-jammy-py3.12-triton-cpu, @@ -97,21 +96,21 @@ jobs: # [see note: pytorch repo ref] # deep clone (fetch-depth 0) required for git merge-base - name: Checkout PyTorch - uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.9 - name: Setup Linux uses: ./.github/actions/setup-linux - name: Build docker image id: build-docker-image - uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.9 with: docker-image-name: ci-image:${{ matrix.docker-image-name }} always-rebuild: true push: true - name: Pull docker image - uses: pytorch/test-infra/.github/actions/pull-docker-image@main + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.9 with: docker-image: ${{ steps.build-docker-image.outputs.docker-image }} @@ -142,5 +141,5 @@ jobs: if: always() - name: Teardown Linux - uses: pytorch/test-infra/.github/actions/teardown-linux@main + uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.9 if: always() diff --git a/.github/workflows/docker-cache-mi300.yml b/.github/workflows/docker-cache-mi300.yml index 02c1171c567a..bc2ae450f7c2 100644 --- a/.github/workflows/docker-cache-mi300.yml +++ b/.github/workflows/docker-cache-mi300.yml @@ -20,7 +20,7 @@ jobs: runs-on: rocm-docker steps: - name: Checkout PyTorch - uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.9 with: no-sudo: true @@ -39,13 +39,13 @@ jobs: - name: Calculate docker image id: calculate-docker-image - uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.9 with: docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3 push: false - name: Pull docker image - uses: pytorch/test-infra/.github/actions/pull-docker-image@main + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.9 with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} diff --git a/.github/workflows/docker-release.yml b/.github/workflows/docker-release.yml index 2560ebf7912a..134e4caf3088 100644 --- a/.github/workflows/docker-release.yml +++ b/.github/workflows/docker-release.yml @@ -37,7 +37,7 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.9 with: triggering_actor: ${{ 
github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -52,7 +52,7 @@ jobs: matrix: ${{ steps.generate-matrix.outputs.matrix }} steps: - name: Checkout PyTorch - uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.9 with: fetch-depth: 1 submodules: true @@ -82,7 +82,7 @@ jobs: CUDNN_VERSION: ${{ matrix.cudnn_version }} steps: - name: Setup SSH (Click me for login details) - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 with: github-secret: ${{ secrets.GITHUB_TOKEN }} # [see note: pytorch repo ref] @@ -164,12 +164,12 @@ jobs: fi - name: Teardown Linux - uses: pytorch/test-infra/.github/actions/teardown-linux@main + uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.9 if: always() validate: needs: build - uses: pytorch/test-infra/.github/workflows/validate-docker-images.yml@main + uses: pytorch/test-infra/.github/workflows/validate-docker-images.yml@release/2.9 with: - channel: nightly + channel: test ref: main diff --git a/.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml b/.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml index 59b14b455e9a..7e36c82644dc 100644 --- a/.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml +++ b/.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml @@ -41,7 +41,7 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.9 with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -112,7 +112,7 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - manywheel-py3_10-cuda-aarch64-12_9-build: + manywheel-py3_10-cuda-aarch64-12_6-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml needs: get-label-type @@ -121,39 +121,131 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: "12.9-aarch64" + DESIRED_CUDA: cu126 + GPU_ARCH_VERSION: "12.6-aarch64" GPU_ARCH_TYPE: cuda-aarch64 DOCKER_IMAGE: manylinuxaarch64-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 DESIRED_PYTHON: "3.10" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runs_on: linux.arm64.m7g.4xlarge.ephemeral ALPINE_IMAGE: "arm64v8/alpine" - build_name: manywheel-py3_10-cuda-aarch64-12_9 + build_name: manywheel-py3_10-cuda-aarch64-12_6 build_environment: linux-aarch64-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | 
nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' timeout-minutes: 420 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_10-cuda-aarch64-12_9-upload: # Uploading + manywheel-py3_10-cuda-aarch64-12_6-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: manywheel-py3_10-cuda-aarch64-12_9-build + needs: manywheel-py3_10-cuda-aarch64-12_6-build with: PYTORCH_ROOT: /pytorch PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: "12.9-aarch64" + DESIRED_CUDA: cu126 + GPU_ARCH_VERSION: "12.6-aarch64" GPU_ARCH_TYPE: cuda-aarch64 DOCKER_IMAGE: manylinuxaarch64-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 DESIRED_PYTHON: "3.10" - build_name: manywheel-py3_10-cuda-aarch64-12_9 + build_name: manywheel-py3_10-cuda-aarch64-12_6 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + + manywheel-py3_10-cuda-aarch64-12_8-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu128 + GPU_ARCH_VERSION: "12.8-aarch64" + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 + DESIRED_PYTHON: "3.10" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.arm64.m7g.4xlarge.ephemeral + ALPINE_IMAGE: "arm64v8/alpine" + build_name: manywheel-py3_10-cuda-aarch64-12_8 + build_environment: linux-aarch64-binary-manywheel + 
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' + timeout-minutes: 420 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_10-cuda-aarch64-12_8-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_10-cuda-aarch64-12_8-build + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu128 + GPU_ARCH_VERSION: "12.8-aarch64" + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 + DESIRED_PYTHON: "3.10" + build_name: manywheel-py3_10-cuda-aarch64-12_8 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + + manywheel-py3_10-cuda-aarch64-13_0-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0-aarch64" + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda13.0 + DESIRED_PYTHON: "3.10" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.arm64.m7g.4xlarge.ephemeral + ALPINE_IMAGE: "arm64v8/alpine" + build_name: manywheel-py3_10-cuda-aarch64-13_0 + build_environment: linux-aarch64-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' + timeout-minutes: 420 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_10-cuda-aarch64-13_0-upload: # Uploading + if: ${{ github.repository_owner == 
'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_10-cuda-aarch64-13_0-build + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0-aarch64" + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda13.0 + DESIRED_PYTHON: "3.10" + build_name: manywheel-py3_10-cuda-aarch64-13_0 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml @@ -223,7 +315,99 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - manywheel-py3_11-cuda-aarch64-12_9-build: + manywheel-py3_11-cuda-aarch64-12_6-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu126 + GPU_ARCH_VERSION: "12.6-aarch64" + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 + DESIRED_PYTHON: "3.11" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.arm64.m7g.4xlarge.ephemeral + ALPINE_IMAGE: "arm64v8/alpine" + build_name: manywheel-py3_11-cuda-aarch64-12_6 + build_environment: linux-aarch64-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' + timeout-minutes: 420 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_11-cuda-aarch64-12_6-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_11-cuda-aarch64-12_6-build + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu126 + GPU_ARCH_VERSION: "12.6-aarch64" + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 + DESIRED_PYTHON: "3.11" + build_name: manywheel-py3_11-cuda-aarch64-12_6 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + + manywheel-py3_11-cuda-aarch64-12_8-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want 
to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu128 + GPU_ARCH_VERSION: "12.8-aarch64" + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 + DESIRED_PYTHON: "3.11" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.arm64.m7g.4xlarge.ephemeral + ALPINE_IMAGE: "arm64v8/alpine" + build_name: manywheel-py3_11-cuda-aarch64-12_8 + build_environment: linux-aarch64-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' + timeout-minutes: 420 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_11-cuda-aarch64-12_8-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_11-cuda-aarch64-12_8-build + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu128 + GPU_ARCH_VERSION: "12.8-aarch64" + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 + DESIRED_PYTHON: "3.11" + build_name: manywheel-py3_11-cuda-aarch64-12_8 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + + manywheel-py3_11-cuda-aarch64-13_0-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml needs: get-label-type @@ -232,39 +416,39 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: "12.9-aarch64" + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0-aarch64" GPU_ARCH_TYPE: cuda-aarch64 DOCKER_IMAGE: manylinuxaarch64-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + DOCKER_IMAGE_TAG_PREFIX: cuda13.0 DESIRED_PYTHON: "3.11" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runs_on: linux.arm64.m7g.4xlarge.ephemeral ALPINE_IMAGE: "arm64v8/alpine" - build_name: manywheel-py3_11-cuda-aarch64-12_9 + build_name: manywheel-py3_11-cuda-aarch64-13_0 build_environment: linux-aarch64-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 
'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' timeout-minutes: 420 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_11-cuda-aarch64-12_9-upload: # Uploading + manywheel-py3_11-cuda-aarch64-13_0-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: manywheel-py3_11-cuda-aarch64-12_9-build + needs: manywheel-py3_11-cuda-aarch64-13_0-build with: PYTORCH_ROOT: /pytorch PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: "12.9-aarch64" + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0-aarch64" GPU_ARCH_TYPE: cuda-aarch64 DOCKER_IMAGE: manylinuxaarch64-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + DOCKER_IMAGE_TAG_PREFIX: cuda13.0 DESIRED_PYTHON: "3.11" - build_name: manywheel-py3_11-cuda-aarch64-12_9 + build_name: manywheel-py3_11-cuda-aarch64-13_0 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml @@ -334,7 +518,53 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - manywheel-py3_12-cuda-aarch64-12_9-build: + manywheel-py3_12-cuda-aarch64-12_6-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu126 + GPU_ARCH_VERSION: "12.6-aarch64" + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 + DESIRED_PYTHON: "3.12" + runner_prefix: "${{ 
needs.get-label-type.outputs.label-type }}" + runs_on: linux.arm64.m7g.4xlarge.ephemeral + ALPINE_IMAGE: "arm64v8/alpine" + build_name: manywheel-py3_12-cuda-aarch64-12_6 + build_environment: linux-aarch64-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' + timeout-minutes: 420 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_12-cuda-aarch64-12_6-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_12-cuda-aarch64-12_6-build + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu126 + GPU_ARCH_VERSION: "12.6-aarch64" + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 + DESIRED_PYTHON: "3.12" + build_name: manywheel-py3_12-cuda-aarch64-12_6 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + + manywheel-py3_12-cuda-aarch64-12_8-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml needs: get-label-type @@ -343,39 +573,85 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: "12.9-aarch64" + DESIRED_CUDA: cu128 + GPU_ARCH_VERSION: "12.8-aarch64" GPU_ARCH_TYPE: cuda-aarch64 DOCKER_IMAGE: manylinuxaarch64-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 DESIRED_PYTHON: "3.12" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runs_on: linux.arm64.m7g.4xlarge.ephemeral ALPINE_IMAGE: "arm64v8/alpine" - build_name: manywheel-py3_12-cuda-aarch64-12_9 + build_name: manywheel-py3_12-cuda-aarch64-12_8 build_environment: linux-aarch64-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; 
platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' timeout-minutes: 420 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_12-cuda-aarch64-12_9-upload: # Uploading + manywheel-py3_12-cuda-aarch64-12_8-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: manywheel-py3_12-cuda-aarch64-12_9-build + needs: manywheel-py3_12-cuda-aarch64-12_8-build with: PYTORCH_ROOT: /pytorch PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: "12.9-aarch64" + DESIRED_CUDA: cu128 + GPU_ARCH_VERSION: "12.8-aarch64" GPU_ARCH_TYPE: cuda-aarch64 DOCKER_IMAGE: manylinuxaarch64-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 DESIRED_PYTHON: "3.12" - build_name: manywheel-py3_12-cuda-aarch64-12_9 + build_name: manywheel-py3_12-cuda-aarch64-12_8 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + + manywheel-py3_12-cuda-aarch64-13_0-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0-aarch64" + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda13.0 + DESIRED_PYTHON: "3.12" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.arm64.m7g.4xlarge.ephemeral + ALPINE_IMAGE: "arm64v8/alpine" + build_name: manywheel-py3_12-cuda-aarch64-13_0 + build_environment: linux-aarch64-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; 
platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' + timeout-minutes: 420 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_12-cuda-aarch64-13_0-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_12-cuda-aarch64-13_0-build + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0-aarch64" + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda13.0 + DESIRED_PYTHON: "3.12" + build_name: manywheel-py3_12-cuda-aarch64-13_0 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml @@ -445,7 +721,53 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - manywheel-py3_13-cuda-aarch64-12_9-build: + manywheel-py3_13-cuda-aarch64-12_6-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu126 + GPU_ARCH_VERSION: "12.6-aarch64" + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 + DESIRED_PYTHON: "3.13" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.arm64.m7g.4xlarge.ephemeral + ALPINE_IMAGE: "arm64v8/alpine" + build_name: manywheel-py3_13-cuda-aarch64-12_6 + build_environment: linux-aarch64-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' + timeout-minutes: 420 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_13-cuda-aarch64-12_6-upload: # Uploading + if: ${{ 
github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_13-cuda-aarch64-12_6-build + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu126 + GPU_ARCH_VERSION: "12.6-aarch64" + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 + DESIRED_PYTHON: "3.13" + build_name: manywheel-py3_13-cuda-aarch64-12_6 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + + manywheel-py3_13-cuda-aarch64-12_8-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml needs: get-label-type @@ -454,39 +776,85 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: "12.9-aarch64" + DESIRED_CUDA: cu128 + GPU_ARCH_VERSION: "12.8-aarch64" GPU_ARCH_TYPE: cuda-aarch64 DOCKER_IMAGE: manylinuxaarch64-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 DESIRED_PYTHON: "3.13" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runs_on: linux.arm64.m7g.4xlarge.ephemeral ALPINE_IMAGE: "arm64v8/alpine" - build_name: manywheel-py3_13-cuda-aarch64-12_9 + build_name: manywheel-py3_13-cuda-aarch64-12_8 build_environment: linux-aarch64-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | 
nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' timeout-minutes: 420 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_13-cuda-aarch64-12_9-upload: # Uploading + manywheel-py3_13-cuda-aarch64-12_8-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: manywheel-py3_13-cuda-aarch64-12_9-build + needs: manywheel-py3_13-cuda-aarch64-12_8-build with: PYTORCH_ROOT: /pytorch PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: "12.9-aarch64" + DESIRED_CUDA: cu128 + GPU_ARCH_VERSION: "12.8-aarch64" GPU_ARCH_TYPE: cuda-aarch64 DOCKER_IMAGE: manylinuxaarch64-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 DESIRED_PYTHON: "3.13" - build_name: manywheel-py3_13-cuda-aarch64-12_9 + build_name: manywheel-py3_13-cuda-aarch64-12_8 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + + manywheel-py3_13-cuda-aarch64-13_0-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0-aarch64" + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda13.0 + DESIRED_PYTHON: "3.13" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.arm64.m7g.4xlarge.ephemeral + ALPINE_IMAGE: "arm64v8/alpine" + build_name: manywheel-py3_13-cuda-aarch64-13_0 + build_environment: linux-aarch64-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' + timeout-minutes: 420 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_13-cuda-aarch64-13_0-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_13-cuda-aarch64-13_0-build + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0-aarch64" + GPU_ARCH_TYPE: cuda-aarch64 + 
DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda13.0 + DESIRED_PYTHON: "3.13" + build_name: manywheel-py3_13-cuda-aarch64-13_0 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml @@ -556,7 +924,7 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - manywheel-py3_13t-cuda-aarch64-12_9-build: + manywheel-py3_13t-cuda-aarch64-12_6-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml needs: get-label-type @@ -565,39 +933,131 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: "12.9-aarch64" + DESIRED_CUDA: cu126 + GPU_ARCH_VERSION: "12.6-aarch64" GPU_ARCH_TYPE: cuda-aarch64 DOCKER_IMAGE: manylinuxaarch64-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 DESIRED_PYTHON: "3.13t" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runs_on: linux.arm64.m7g.4xlarge.ephemeral ALPINE_IMAGE: "arm64v8/alpine" - build_name: manywheel-py3_13t-cuda-aarch64-12_9 + build_name: manywheel-py3_13t-cuda-aarch64-12_6 build_environment: linux-aarch64-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | 
nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' timeout-minutes: 420 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_13t-cuda-aarch64-12_9-upload: # Uploading + manywheel-py3_13t-cuda-aarch64-12_6-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: manywheel-py3_13t-cuda-aarch64-12_9-build + needs: manywheel-py3_13t-cuda-aarch64-12_6-build with: PYTORCH_ROOT: /pytorch PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: "12.9-aarch64" + DESIRED_CUDA: cu126 + GPU_ARCH_VERSION: "12.6-aarch64" GPU_ARCH_TYPE: cuda-aarch64 DOCKER_IMAGE: manylinuxaarch64-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 DESIRED_PYTHON: "3.13t" - build_name: manywheel-py3_13t-cuda-aarch64-12_9 + build_name: manywheel-py3_13t-cuda-aarch64-12_6 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + + manywheel-py3_13t-cuda-aarch64-12_8-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu128 + GPU_ARCH_VERSION: "12.8-aarch64" + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 + DESIRED_PYTHON: "3.13t" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.arm64.m7g.4xlarge.ephemeral + ALPINE_IMAGE: "arm64v8/alpine" + build_name: manywheel-py3_13t-cuda-aarch64-12_8 + build_environment: linux-aarch64-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' + timeout-minutes: 420 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_13t-cuda-aarch64-12_8-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_13t-cuda-aarch64-12_8-build + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu128 + GPU_ARCH_VERSION: "12.8-aarch64" + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 + DESIRED_PYTHON: "3.13t" + build_name: manywheel-py3_13t-cuda-aarch64-12_8 + secrets: + 
github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + + manywheel-py3_13t-cuda-aarch64-13_0-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0-aarch64" + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda13.0 + DESIRED_PYTHON: "3.13t" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.arm64.m7g.4xlarge.ephemeral + ALPINE_IMAGE: "arm64v8/alpine" + build_name: manywheel-py3_13t-cuda-aarch64-13_0 + build_environment: linux-aarch64-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' + timeout-minutes: 420 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_13t-cuda-aarch64-13_0-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_13t-cuda-aarch64-13_0-build + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0-aarch64" + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda13.0 + DESIRED_PYTHON: "3.13t" + build_name: manywheel-py3_13t-cuda-aarch64-13_0 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml @@ -667,7 +1127,99 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - manywheel-py3_14-cuda-aarch64-12_9-build: + manywheel-py3_14-cuda-aarch64-12_6-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu126 + GPU_ARCH_VERSION: "12.6-aarch64" + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 + DESIRED_PYTHON: "3.14" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.arm64.m7g.4xlarge.ephemeral + ALPINE_IMAGE: "arm64v8/alpine" + build_name: manywheel-py3_14-cuda-aarch64-12_6 + build_environment: linux-aarch64-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 
'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' + timeout-minutes: 420 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_14-cuda-aarch64-12_6-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_14-cuda-aarch64-12_6-build + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu126 + GPU_ARCH_VERSION: "12.6-aarch64" + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 + DESIRED_PYTHON: "3.14" + build_name: manywheel-py3_14-cuda-aarch64-12_6 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + + manywheel-py3_14-cuda-aarch64-12_8-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu128 + GPU_ARCH_VERSION: "12.8-aarch64" + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 + DESIRED_PYTHON: "3.14" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.arm64.m7g.4xlarge.ephemeral + ALPINE_IMAGE: "arm64v8/alpine" + build_name: manywheel-py3_14-cuda-aarch64-12_8 + build_environment: linux-aarch64-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' + timeout-minutes: 420 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_14-cuda-aarch64-12_8-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: 
write + contents: read + needs: manywheel-py3_14-cuda-aarch64-12_8-build + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu128 + GPU_ARCH_VERSION: "12.8-aarch64" + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 + DESIRED_PYTHON: "3.14" + build_name: manywheel-py3_14-cuda-aarch64-12_8 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + + manywheel-py3_14-cuda-aarch64-13_0-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml needs: get-label-type @@ -676,39 +1228,39 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: "12.9-aarch64" + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0-aarch64" GPU_ARCH_TYPE: cuda-aarch64 DOCKER_IMAGE: manylinuxaarch64-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + DOCKER_IMAGE_TAG_PREFIX: cuda13.0 DESIRED_PYTHON: "3.14" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runs_on: linux.arm64.m7g.4xlarge.ephemeral ALPINE_IMAGE: "arm64v8/alpine" - build_name: manywheel-py3_14-cuda-aarch64-12_9 + build_name: manywheel-py3_14-cuda-aarch64-13_0 build_environment: linux-aarch64-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' 
| nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' timeout-minutes: 420 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_14-cuda-aarch64-12_9-upload: # Uploading + manywheel-py3_14-cuda-aarch64-13_0-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: manywheel-py3_14-cuda-aarch64-12_9-build + needs: manywheel-py3_14-cuda-aarch64-13_0-build with: PYTORCH_ROOT: /pytorch PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: "12.9-aarch64" + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0-aarch64" GPU_ARCH_TYPE: cuda-aarch64 DOCKER_IMAGE: manylinuxaarch64-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + DOCKER_IMAGE_TAG_PREFIX: cuda13.0 DESIRED_PYTHON: "3.14" - build_name: manywheel-py3_14-cuda-aarch64-12_9 + build_name: manywheel-py3_14-cuda-aarch64-13_0 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml @@ -778,7 +1330,99 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - manywheel-py3_14t-cuda-aarch64-12_9-build: + manywheel-py3_14t-cuda-aarch64-12_6-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu126 + GPU_ARCH_VERSION: "12.6-aarch64" + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 + DESIRED_PYTHON: "3.14t" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.arm64.m7g.4xlarge.ephemeral + ALPINE_IMAGE: "arm64v8/alpine" + build_name: manywheel-py3_14t-cuda-aarch64-12_6 + build_environment: linux-aarch64-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' + timeout-minutes: 420 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_14t-cuda-aarch64-12_6-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_14t-cuda-aarch64-12_6-build + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + 
DESIRED_CUDA: cu126 + GPU_ARCH_VERSION: "12.6-aarch64" + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 + DESIRED_PYTHON: "3.14t" + build_name: manywheel-py3_14t-cuda-aarch64-12_6 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + + manywheel-py3_14t-cuda-aarch64-12_8-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu128 + GPU_ARCH_VERSION: "12.8-aarch64" + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 + DESIRED_PYTHON: "3.14t" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.arm64.m7g.4xlarge.ephemeral + ALPINE_IMAGE: "arm64v8/alpine" + build_name: manywheel-py3_14t-cuda-aarch64-12_8 + build_environment: linux-aarch64-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' + timeout-minutes: 420 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_14t-cuda-aarch64-12_8-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_14t-cuda-aarch64-12_8-build + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu128 + GPU_ARCH_VERSION: "12.8-aarch64" + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 + DESIRED_PYTHON: "3.14t" + build_name: manywheel-py3_14t-cuda-aarch64-12_8 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + + manywheel-py3_14t-cuda-aarch64-13_0-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml needs: get-label-type @@ -787,39 +1431,39 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: "12.9-aarch64" + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0-aarch64" GPU_ARCH_TYPE: cuda-aarch64 DOCKER_IMAGE: manylinuxaarch64-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + DOCKER_IMAGE_TAG_PREFIX: cuda13.0 DESIRED_PYTHON: "3.14t" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runs_on: linux.arm64.m7g.4xlarge.ephemeral 
ALPINE_IMAGE: "arm64v8/alpine" - build_name: manywheel-py3_14t-cuda-aarch64-12_9 + build_name: manywheel-py3_14t-cuda-aarch64-13_0 build_environment: linux-aarch64-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' timeout-minutes: 420 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_14t-cuda-aarch64-12_9-upload: # Uploading + manywheel-py3_14t-cuda-aarch64-13_0-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: manywheel-py3_14t-cuda-aarch64-12_9-build + needs: manywheel-py3_14t-cuda-aarch64-13_0-build with: PYTORCH_ROOT: /pytorch PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: "12.9-aarch64" + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0-aarch64" GPU_ARCH_TYPE: cuda-aarch64 DOCKER_IMAGE: manylinuxaarch64-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + DOCKER_IMAGE_TAG_PREFIX: cuda13.0 DESIRED_PYTHON: "3.14t" - build_name: manywheel-py3_14t-cuda-aarch64-12_9 + build_name: manywheel-py3_14t-cuda-aarch64-13_0 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml diff --git 
a/.github/workflows/generated-linux-binary-libtorch-nightly.yml b/.github/workflows/generated-linux-binary-libtorch-nightly.yml index 776e77e80826..bc671ae80ae2 100644 --- a/.github/workflows/generated-linux-binary-libtorch-nightly.yml +++ b/.github/workflows/generated-linux-binary-libtorch-nightly.yml @@ -41,7 +41,7 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.9 with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -248,7 +248,7 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - libtorch-cuda12_9-shared-with-deps-release-build: + libtorch-cuda13_0-shared-with-deps-release-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml needs: get-label-type @@ -257,22 +257,22 @@ jobs: PACKAGE_TYPE: libtorch # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: "12.9" + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: libtorch-cxx11-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + DOCKER_IMAGE_TAG_PREFIX: cuda13.0 LIBTORCH_CONFIG: release LIBTORCH_VARIANT: shared-with-deps runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build_name: libtorch-cuda12_9-shared-with-deps-release + build_name: libtorch-cuda13_0-shared-with-deps-release build_environment: linux-binary-libtorch secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - libtorch-cuda12_9-shared-with-deps-release-test: # Testing + libtorch-cuda13_0-shared-with-deps-release-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - libtorch-cuda12_9-shared-with-deps-release-build + - libtorch-cuda13_0-shared-with-deps-release-build - get-label-type uses: ./.github/workflows/_binary-test-linux.yml with: @@ -280,38 +280,38 @@ jobs: PACKAGE_TYPE: libtorch # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: "12.9" + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: libtorch-cxx11-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + DOCKER_IMAGE_TAG_PREFIX: cuda13.0 LIBTORCH_CONFIG: release LIBTORCH_VARIANT: shared-with-deps - build_name: libtorch-cuda12_9-shared-with-deps-release + build_name: libtorch-cuda13_0-shared-with-deps-release build_environment: linux-binary-libtorch runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - libtorch-cuda12_9-shared-with-deps-release-upload: # Uploading + libtorch-cuda13_0-shared-with-deps-release-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: libtorch-cuda12_9-shared-with-deps-release-test + needs: libtorch-cuda13_0-shared-with-deps-release-test with: PYTORCH_ROOT: /pytorch PACKAGE_TYPE: libtorch # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: "12.9" + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: libtorch-cxx11-builder - 
DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + DOCKER_IMAGE_TAG_PREFIX: cuda13.0 LIBTORCH_CONFIG: release LIBTORCH_VARIANT: shared-with-deps - build_name: libtorch-cuda12_9-shared-with-deps-release + build_name: libtorch-cuda13_0-shared-with-deps-release secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml @@ -333,6 +333,7 @@ jobs: LIBTORCH_CONFIG: release LIBTORCH_VARIANT: shared-with-deps runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + timeout-minutes: 300 build_name: libtorch-rocm6_3-shared-with-deps-release build_environment: linux-binary-libtorch secrets: @@ -368,7 +369,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -390,7 +390,7 @@ jobs: role-duration-seconds: 18000 - name: Calculate docker image id: calculate-docker-image - uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.9 with: docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: libtorch-cxx11-builder @@ -398,7 +398,7 @@ jobs: docker-build-dir: .ci/docker working-directory: pytorch - name: Pull Docker image - uses: pytorch/test-infra/.github/actions/pull-docker-image@main + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.9 with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Test Pytorch binary @@ -447,6 +447,7 @@ jobs: LIBTORCH_CONFIG: release LIBTORCH_VARIANT: shared-with-deps runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + timeout-minutes: 300 build_name: libtorch-rocm6_4-shared-with-deps-release build_environment: linux-binary-libtorch secrets: @@ -482,7 +483,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -504,7 +504,7 @@ jobs: role-duration-seconds: 18000 - name: Calculate docker image id: calculate-docker-image - uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.9 with: docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: libtorch-cxx11-builder @@ -512,7 +512,7 @@ jobs: docker-build-dir: .ci/docker working-directory: pytorch - name: Pull Docker image - uses: pytorch/test-infra/.github/actions/pull-docker-image@main + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.9 with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Test Pytorch binary diff --git a/.github/workflows/generated-linux-binary-libtorch-release-main.yml b/.github/workflows/generated-linux-binary-libtorch-release-main.yml index c98d71dfefc4..9d55fc6e50ab 100644 --- a/.github/workflows/generated-linux-binary-libtorch-release-main.yml +++ b/.github/workflows/generated-linux-binary-libtorch-release-main.yml @@ -36,7 +36,7 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: 
pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.9 with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} diff --git a/.github/workflows/generated-linux-binary-manywheel-main.yml b/.github/workflows/generated-linux-binary-manywheel-main.yml index ec08b2c78eb6..85b91378b253 100644 --- a/.github/workflows/generated-linux-binary-manywheel-main.yml +++ b/.github/workflows/generated-linux-binary-manywheel-main.yml @@ -36,7 +36,7 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.9 with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -60,7 +60,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_12-cuda12_8 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_12-cuda12_8-test: # Testing diff 
--git a/.github/workflows/generated-linux-binary-manywheel-nightly.yml b/.github/workflows/generated-linux-binary-manywheel-nightly.yml index 96a4a0fff837..5f9eaab976a6 100644 --- a/.github/workflows/generated-linux-binary-manywheel-nightly.yml +++ b/.github/workflows/generated-linux-binary-manywheel-nightly.yml @@ -41,7 +41,7 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.9 with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -127,7 +127,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_10-cuda12_6 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_10-cuda12_6-test: # Testing @@ -193,7 +193,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_10-cuda12_8 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: 
nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_10-cuda12_8-test: # Testing @@ -241,72 +241,6 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - manywheel-py3_10-cuda12_9-build: - if: ${{ github.repository_owner == 'pytorch' }} - uses: ./.github/workflows/_binary-build-linux.yml - needs: get-label-type - with: - PYTORCH_ROOT: /pytorch - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: "12.9" - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 - DESIRED_PYTHON: "3.10" - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build_name: manywheel-py3_10-cuda12_9 - build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 
'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64' - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_10-cuda12_9-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: - - manywheel-py3_10-cuda12_9-build - - get-label-type - uses: ./.github/workflows/_binary-test-linux.yml - with: - PYTORCH_ROOT: /pytorch - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: "12.9" - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 - DESIRED_PYTHON: "3.10" - build_name: manywheel-py3_10-cuda12_9 - build_environment: linux-binary-manywheel - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_10-cuda12_9-upload: # Uploading - if: ${{ github.repository_owner == 'pytorch' }} - permissions: - id-token: write - contents: read - needs: manywheel-py3_10-cuda12_9-test - with: - PYTORCH_ROOT: /pytorch - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: "12.9" - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 - DESIRED_PYTHON: "3.10" - build_name: manywheel-py3_10-cuda12_9 - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - uses: ./.github/workflows/_binary-upload.yml - manywheel-py3_10-cuda13_0-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -325,7 +259,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_10-cuda13_0 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.12.0.46; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and 
platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_10-cuda13_0-test: # Testing @@ -389,6 +323,7 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: rocm6.3 DESIRED_PYTHON: "3.10" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + timeout-minutes: 300 build_name: manywheel-py3_10-rocm6_3 build_environment: linux-binary-manywheel secrets: @@ -423,7 +358,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -445,7 +379,7 @@ jobs: role-duration-seconds: 18000 - name: Calculate docker image id: calculate-docker-image - uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.9 with: docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: manylinux2_28-builder @@ -453,7 +387,7 @@ jobs: docker-build-dir: .ci/docker working-directory: pytorch - name: Pull Docker image - uses: pytorch/test-infra/.github/actions/pull-docker-image@main + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.9 with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Test Pytorch binary @@ -500,6 +434,7 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: rocm6.4 DESIRED_PYTHON: "3.10" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + timeout-minutes: 300 build_name: manywheel-py3_10-rocm6_4 build_environment: linux-binary-manywheel secrets: @@ -534,7 +469,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -556,7 +490,7 @@ jobs: 
role-duration-seconds: 18000 - name: Calculate docker image id: calculate-docker-image - uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.9 with: docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: manylinux2_28-builder @@ -564,7 +498,7 @@ jobs: docker-build-dir: .ci/docker working-directory: pytorch - name: Pull Docker image - uses: pytorch/test-infra/.github/actions/pull-docker-image@main + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.9 with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Test Pytorch binary @@ -612,7 +546,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_10-xpu build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.1.1 | intel-cmplr-lib-ur==2025.1.1 | intel-cmplr-lic-rt==2025.1.1 | intel-sycl-rt==2025.1.1 | oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.1.0 | onemkl-sycl-dft==2025.1.0 | onemkl-sycl-lapack==2025.1.0 | onemkl-sycl-rng==2025.1.0 | onemkl-sycl-sparse==2025.1.0 | dpcpp-cpp-rt==2025.1.1 | intel-opencl-rt==2025.1.1 | mkl==2025.1.0 | intel-openmp==2025.1.1 | tbb==2022.1.0 | tcmlib==1.3.0 | umf==0.10.0 | intel-pti==0.12.3 + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.2.1 | intel-cmplr-lib-ur==2025.2.1 | intel-cmplr-lic-rt==2025.2.1 | intel-sycl-rt==2025.2.1 | oneccl-devel==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.2.0 | onemkl-sycl-dft==2025.2.0 | onemkl-sycl-lapack==2025.2.0 | onemkl-sycl-rng==2025.2.0 | onemkl-sycl-sparse==2025.2.0 | dpcpp-cpp-rt==2025.2.1 | intel-opencl-rt==2025.2.1 | mkl==2025.2.0 | intel-openmp==2025.2.1 | tbb==2022.2.0 | tcmlib==1.4.0 | umf==0.11.0 | intel-pti==0.13.1 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_10-xpu-test: # Testing @@ -638,7 +572,7 @@ jobs: contents: read steps: - name: Setup XPU - uses: ./.github/actions/setup-xpu + uses: pytorch/pytorch/.github/actions/setup-xpu@release/2.9 - name: configure aws credentials id: aws_creds uses: aws-actions/configure-aws-credentials@v4 @@ -656,7 +590,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -667,7 +600,7 @@ jobs: working-directory: pytorch - name: Calculate docker image id: calculate-docker-image - uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.9 with: docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: manylinux2_28-builder @@ -675,7 +608,7 @@ jobs: docker-build-dir: .ci/docker working-directory: pytorch - name: Pull Docker image - uses: 
pytorch/test-infra/.github/actions/pull-docker-image@main + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.9 with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Test Pytorch binary @@ -785,7 +718,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_11-cuda12_6 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_11-cuda12_6-test: # Testing @@ -851,7 +784,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_11-cuda12_8 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | 
nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_11-cuda12_8-test: # Testing @@ -899,72 +832,6 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - manywheel-py3_11-cuda12_9-build: - if: ${{ github.repository_owner == 'pytorch' }} - uses: ./.github/workflows/_binary-build-linux.yml - needs: get-label-type - with: - PYTORCH_ROOT: /pytorch - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: "12.9" - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 - DESIRED_PYTHON: "3.11" - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build_name: manywheel-py3_11-cuda12_9 - build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 
'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64' - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_11-cuda12_9-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: - - manywheel-py3_11-cuda12_9-build - - get-label-type - uses: ./.github/workflows/_binary-test-linux.yml - with: - PYTORCH_ROOT: /pytorch - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: "12.9" - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 - DESIRED_PYTHON: "3.11" - build_name: manywheel-py3_11-cuda12_9 - build_environment: linux-binary-manywheel - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_11-cuda12_9-upload: # Uploading - if: ${{ github.repository_owner == 'pytorch' }} - permissions: - id-token: write - contents: read - needs: manywheel-py3_11-cuda12_9-test - with: - PYTORCH_ROOT: /pytorch - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: "12.9" - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 - DESIRED_PYTHON: "3.11" - build_name: manywheel-py3_11-cuda12_9 - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - uses: ./.github/workflows/_binary-upload.yml - manywheel-py3_11-cuda13_0-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -983,7 +850,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_11-cuda13_0 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.12.0.46; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | 
nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_11-cuda13_0-test: # Testing @@ -1047,6 +914,7 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: rocm6.3 DESIRED_PYTHON: "3.11" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + timeout-minutes: 300 build_name: manywheel-py3_11-rocm6_3 build_environment: linux-binary-manywheel secrets: @@ -1081,7 +949,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -1103,7 +970,7 @@ jobs: role-duration-seconds: 18000 - name: Calculate docker image id: calculate-docker-image - uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.9 with: docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: manylinux2_28-builder @@ -1111,7 +978,7 @@ jobs: docker-build-dir: .ci/docker working-directory: pytorch - name: Pull Docker image - uses: pytorch/test-infra/.github/actions/pull-docker-image@main + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.9 with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Test Pytorch binary @@ -1158,6 +1025,7 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: rocm6.4 DESIRED_PYTHON: "3.11" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + timeout-minutes: 300 build_name: manywheel-py3_11-rocm6_4 build_environment: linux-binary-manywheel secrets: @@ -1192,7 +1060,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -1214,7 +1081,7 @@ jobs: role-duration-seconds: 18000 - name: Calculate docker image id: calculate-docker-image - uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.9 with: docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: manylinux2_28-builder @@ -1222,7 +1089,7 @@ jobs: docker-build-dir: .ci/docker 
working-directory: pytorch - name: Pull Docker image - uses: pytorch/test-infra/.github/actions/pull-docker-image@main + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.9 with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Test Pytorch binary @@ -1270,7 +1137,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_11-xpu build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.1.1 | intel-cmplr-lib-ur==2025.1.1 | intel-cmplr-lic-rt==2025.1.1 | intel-sycl-rt==2025.1.1 | oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.1.0 | onemkl-sycl-dft==2025.1.0 | onemkl-sycl-lapack==2025.1.0 | onemkl-sycl-rng==2025.1.0 | onemkl-sycl-sparse==2025.1.0 | dpcpp-cpp-rt==2025.1.1 | intel-opencl-rt==2025.1.1 | mkl==2025.1.0 | intel-openmp==2025.1.1 | tbb==2022.1.0 | tcmlib==1.3.0 | umf==0.10.0 | intel-pti==0.12.3 + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.2.1 | intel-cmplr-lib-ur==2025.2.1 | intel-cmplr-lic-rt==2025.2.1 | intel-sycl-rt==2025.2.1 | oneccl-devel==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.2.0 | onemkl-sycl-dft==2025.2.0 | onemkl-sycl-lapack==2025.2.0 | onemkl-sycl-rng==2025.2.0 | onemkl-sycl-sparse==2025.2.0 | dpcpp-cpp-rt==2025.2.1 | intel-opencl-rt==2025.2.1 | mkl==2025.2.0 | intel-openmp==2025.2.1 | tbb==2022.2.0 | tcmlib==1.4.0 | umf==0.11.0 | intel-pti==0.13.1 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_11-xpu-test: # Testing @@ -1296,7 +1163,7 @@ jobs: contents: read steps: - name: Setup XPU - uses: ./.github/actions/setup-xpu + uses: pytorch/pytorch/.github/actions/setup-xpu@release/2.9 - name: configure aws credentials id: aws_creds uses: aws-actions/configure-aws-credentials@v4 @@ -1314,7 +1181,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -1325,7 +1191,7 @@ jobs: working-directory: pytorch - name: Calculate docker image id: calculate-docker-image - uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.9 with: docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: manylinux2_28-builder @@ -1333,7 +1199,7 @@ jobs: docker-build-dir: .ci/docker working-directory: pytorch - name: Pull Docker image - uses: pytorch/test-infra/.github/actions/pull-docker-image@main + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.9 with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Test Pytorch binary @@ -1443,7 +1309,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_12-cuda12_6 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' 
and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_12-cuda12_6-test: # Testing @@ -1509,7 +1375,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_12-cuda12_8 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' 
and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_12-cuda12_8-test: # Testing @@ -1557,72 +1423,6 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - manywheel-py3_12-cuda12_9-build: - if: ${{ github.repository_owner == 'pytorch' }} - uses: ./.github/workflows/_binary-build-linux.yml - needs: get-label-type - with: - PYTORCH_ROOT: /pytorch - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: "12.9" - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 - DESIRED_PYTHON: "3.12" - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build_name: manywheel-py3_12-cuda12_9 - build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | 
nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64' - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_12-cuda12_9-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: - - manywheel-py3_12-cuda12_9-build - - get-label-type - uses: ./.github/workflows/_binary-test-linux.yml - with: - PYTORCH_ROOT: /pytorch - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: "12.9" - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 - DESIRED_PYTHON: "3.12" - build_name: manywheel-py3_12-cuda12_9 - build_environment: linux-binary-manywheel - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_12-cuda12_9-upload: # Uploading - if: ${{ github.repository_owner == 'pytorch' }} - permissions: - id-token: write - contents: read - needs: manywheel-py3_12-cuda12_9-test - with: - PYTORCH_ROOT: /pytorch - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: "12.9" - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 - DESIRED_PYTHON: "3.12" - build_name: manywheel-py3_12-cuda12_9 - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - uses: ./.github/workflows/_binary-upload.yml - manywheel-py3_12-cuda13_0-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -1641,7 +1441,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_12-cuda13_0 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.12.0.46; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | 
nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_12-cuda13_0-test: # Testing @@ -1705,6 +1505,7 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: rocm6.3 DESIRED_PYTHON: "3.12" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + timeout-minutes: 300 build_name: manywheel-py3_12-rocm6_3 build_environment: linux-binary-manywheel secrets: @@ -1739,7 +1540,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -1761,7 +1561,7 @@ jobs: role-duration-seconds: 18000 - name: Calculate docker image id: calculate-docker-image - uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.9 with: docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: manylinux2_28-builder @@ -1769,7 +1569,7 @@ jobs: docker-build-dir: .ci/docker working-directory: pytorch - name: Pull Docker image - uses: pytorch/test-infra/.github/actions/pull-docker-image@main + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.9 with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Test Pytorch binary @@ -1816,6 +1616,7 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: rocm6.4 DESIRED_PYTHON: "3.12" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + timeout-minutes: 300 build_name: manywheel-py3_12-rocm6_4 build_environment: linux-binary-manywheel secrets: @@ -1850,7 +1651,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -1872,7 +1672,7 @@ jobs: role-duration-seconds: 18000 - name: Calculate docker image id: calculate-docker-image - uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.9 with: docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: manylinux2_28-builder @@ -1880,7 +1680,7 @@ jobs: docker-build-dir: .ci/docker working-directory: pytorch - name: Pull Docker image - uses: pytorch/test-infra/.github/actions/pull-docker-image@main + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.9 with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Test Pytorch binary @@ -1928,7 +1728,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_12-xpu build_environment: linux-binary-manywheel - 
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.1.1 | intel-cmplr-lib-ur==2025.1.1 | intel-cmplr-lic-rt==2025.1.1 | intel-sycl-rt==2025.1.1 | oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.1.0 | onemkl-sycl-dft==2025.1.0 | onemkl-sycl-lapack==2025.1.0 | onemkl-sycl-rng==2025.1.0 | onemkl-sycl-sparse==2025.1.0 | dpcpp-cpp-rt==2025.1.1 | intel-opencl-rt==2025.1.1 | mkl==2025.1.0 | intel-openmp==2025.1.1 | tbb==2022.1.0 | tcmlib==1.3.0 | umf==0.10.0 | intel-pti==0.12.3 + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.2.1 | intel-cmplr-lib-ur==2025.2.1 | intel-cmplr-lic-rt==2025.2.1 | intel-sycl-rt==2025.2.1 | oneccl-devel==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.2.0 | onemkl-sycl-dft==2025.2.0 | onemkl-sycl-lapack==2025.2.0 | onemkl-sycl-rng==2025.2.0 | onemkl-sycl-sparse==2025.2.0 | dpcpp-cpp-rt==2025.2.1 | intel-opencl-rt==2025.2.1 | mkl==2025.2.0 | intel-openmp==2025.2.1 | tbb==2022.2.0 | tcmlib==1.4.0 | umf==0.11.0 | intel-pti==0.13.1 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_12-xpu-test: # Testing @@ -1954,7 +1754,7 @@ jobs: contents: read steps: - name: Setup XPU - uses: ./.github/actions/setup-xpu + uses: pytorch/pytorch/.github/actions/setup-xpu@release/2.9 - name: configure aws credentials id: aws_creds uses: aws-actions/configure-aws-credentials@v4 @@ -1972,7 +1772,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -1983,7 +1782,7 @@ jobs: working-directory: pytorch - name: Calculate docker image id: calculate-docker-image - uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.9 with: docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: manylinux2_28-builder @@ -1991,7 +1790,7 @@ jobs: docker-build-dir: .ci/docker working-directory: pytorch - name: Pull Docker image - uses: pytorch/test-infra/.github/actions/pull-docker-image@main + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.9 with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Test Pytorch binary @@ -2101,7 +1900,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_13-cuda12_6 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' 
and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_13-cuda12_6-test: # Testing @@ -2167,7 +1966,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_13-cuda12_8 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: 
nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_13-cuda12_8-test: # Testing @@ -2215,72 +2014,6 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - manywheel-py3_13-cuda12_9-build: - if: ${{ github.repository_owner == 'pytorch' }} - uses: ./.github/workflows/_binary-build-linux.yml - needs: get-label-type - with: - PYTORCH_ROOT: /pytorch - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: "12.9" - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 - DESIRED_PYTHON: "3.13" - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build_name: manywheel-py3_13-cuda12_9 - build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64' - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_13-cuda12_9-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: - - manywheel-py3_13-cuda12_9-build - - get-label-type - uses: ./.github/workflows/_binary-test-linux.yml - with: - PYTORCH_ROOT: /pytorch - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we 
eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: "12.9" - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 - DESIRED_PYTHON: "3.13" - build_name: manywheel-py3_13-cuda12_9 - build_environment: linux-binary-manywheel - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_13-cuda12_9-upload: # Uploading - if: ${{ github.repository_owner == 'pytorch' }} - permissions: - id-token: write - contents: read - needs: manywheel-py3_13-cuda12_9-test - with: - PYTORCH_ROOT: /pytorch - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: "12.9" - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 - DESIRED_PYTHON: "3.13" - build_name: manywheel-py3_13-cuda12_9 - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - uses: ./.github/workflows/_binary-upload.yml - manywheel-py3_13-cuda13_0-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -2299,7 +2032,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_13-cuda13_0 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.12.0.46; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; 
platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_13-cuda13_0-test: # Testing @@ -2363,6 +2096,7 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: rocm6.3 DESIRED_PYTHON: "3.13" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + timeout-minutes: 300 build_name: manywheel-py3_13-rocm6_3 build_environment: linux-binary-manywheel secrets: @@ -2397,7 +2131,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -2419,7 +2152,7 @@ jobs: role-duration-seconds: 18000 - name: Calculate docker image id: calculate-docker-image - uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.9 with: docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: manylinux2_28-builder @@ -2427,7 +2160,7 @@ jobs: docker-build-dir: .ci/docker working-directory: pytorch - name: Pull Docker image - uses: pytorch/test-infra/.github/actions/pull-docker-image@main + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.9 with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Test Pytorch binary @@ -2474,6 +2207,7 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: rocm6.4 DESIRED_PYTHON: "3.13" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + timeout-minutes: 300 build_name: manywheel-py3_13-rocm6_4 build_environment: linux-binary-manywheel secrets: @@ -2508,7 +2242,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -2530,7 +2263,7 @@ jobs: role-duration-seconds: 18000 - name: Calculate docker image id: calculate-docker-image - uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.9 with: docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: manylinux2_28-builder @@ -2538,7 +2271,7 @@ jobs: docker-build-dir: .ci/docker working-directory: pytorch - name: Pull Docker image - uses: pytorch/test-infra/.github/actions/pull-docker-image@main + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.9 with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Test Pytorch binary @@ -2586,7 +2319,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_13-xpu build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.1.1 | intel-cmplr-lib-ur==2025.1.1 | intel-cmplr-lic-rt==2025.1.1 | intel-sycl-rt==2025.1.1 | oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.1.0 | onemkl-sycl-dft==2025.1.0 | 
onemkl-sycl-lapack==2025.1.0 | onemkl-sycl-rng==2025.1.0 | onemkl-sycl-sparse==2025.1.0 | dpcpp-cpp-rt==2025.1.1 | intel-opencl-rt==2025.1.1 | mkl==2025.1.0 | intel-openmp==2025.1.1 | tbb==2022.1.0 | tcmlib==1.3.0 | umf==0.10.0 | intel-pti==0.12.3 + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.2.1 | intel-cmplr-lib-ur==2025.2.1 | intel-cmplr-lic-rt==2025.2.1 | intel-sycl-rt==2025.2.1 | oneccl-devel==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.2.0 | onemkl-sycl-dft==2025.2.0 | onemkl-sycl-lapack==2025.2.0 | onemkl-sycl-rng==2025.2.0 | onemkl-sycl-sparse==2025.2.0 | dpcpp-cpp-rt==2025.2.1 | intel-opencl-rt==2025.2.1 | mkl==2025.2.0 | intel-openmp==2025.2.1 | tbb==2022.2.0 | tcmlib==1.4.0 | umf==0.11.0 | intel-pti==0.13.1 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_13-xpu-test: # Testing @@ -2612,7 +2345,7 @@ jobs: contents: read steps: - name: Setup XPU - uses: ./.github/actions/setup-xpu + uses: pytorch/pytorch/.github/actions/setup-xpu@release/2.9 - name: configure aws credentials id: aws_creds uses: aws-actions/configure-aws-credentials@v4 @@ -2630,7 +2363,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -2641,7 +2373,7 @@ jobs: working-directory: pytorch - name: Calculate docker image id: calculate-docker-image - uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.9 with: docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: manylinux2_28-builder @@ -2649,7 +2381,7 @@ jobs: docker-build-dir: .ci/docker working-directory: pytorch - name: Pull Docker image - uses: pytorch/test-infra/.github/actions/pull-docker-image@main + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.9 with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Test Pytorch binary @@ -2759,7 +2491,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_13t-cuda12_6 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' 
and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_13t-cuda12_6-test: # Testing @@ -2825,7 +2557,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_13t-cuda12_8 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | 
nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_13t-cuda12_8-test: # Testing @@ -2873,72 +2605,6 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - manywheel-py3_13t-cuda12_9-build: - if: ${{ github.repository_owner == 'pytorch' }} - uses: ./.github/workflows/_binary-build-linux.yml - needs: get-label-type - with: - PYTORCH_ROOT: /pytorch - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: "12.9" - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 - DESIRED_PYTHON: "3.13t" - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build_name: manywheel-py3_13t-cuda12_9 - build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64' - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_13t-cuda12_9-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: - - manywheel-py3_13t-cuda12_9-build - - get-label-type - uses: ./.github/workflows/_binary-test-linux.yml - with: - PYTORCH_ROOT: /pytorch - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: "12.9" - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 - DESIRED_PYTHON: "3.13t" - build_name: manywheel-py3_13t-cuda12_9 - build_environment: linux-binary-manywheel - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner - secrets: - github-token: 
${{ secrets.GITHUB_TOKEN }} - manywheel-py3_13t-cuda12_9-upload: # Uploading - if: ${{ github.repository_owner == 'pytorch' }} - permissions: - id-token: write - contents: read - needs: manywheel-py3_13t-cuda12_9-test - with: - PYTORCH_ROOT: /pytorch - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: "12.9" - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 - DESIRED_PYTHON: "3.13t" - build_name: manywheel-py3_13t-cuda12_9 - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - uses: ./.github/workflows/_binary-upload.yml - manywheel-py3_13t-cuda13_0-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -2957,7 +2623,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_13t-cuda13_0 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.12.0.46; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_13t-cuda13_0-test: # Testing @@ -3021,6 +2687,7 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: rocm6.3 DESIRED_PYTHON: "3.13t" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + timeout-minutes: 300 build_name: 
manywheel-py3_13t-rocm6_3 build_environment: linux-binary-manywheel secrets: @@ -3055,7 +2722,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -3077,7 +2743,7 @@ jobs: role-duration-seconds: 18000 - name: Calculate docker image id: calculate-docker-image - uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.9 with: docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: manylinux2_28-builder @@ -3085,7 +2751,7 @@ jobs: docker-build-dir: .ci/docker working-directory: pytorch - name: Pull Docker image - uses: pytorch/test-infra/.github/actions/pull-docker-image@main + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.9 with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Test Pytorch binary @@ -3132,6 +2798,7 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: rocm6.4 DESIRED_PYTHON: "3.13t" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + timeout-minutes: 300 build_name: manywheel-py3_13t-rocm6_4 build_environment: linux-binary-manywheel secrets: @@ -3166,7 +2833,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -3188,7 +2854,7 @@ jobs: role-duration-seconds: 18000 - name: Calculate docker image id: calculate-docker-image - uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.9 with: docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: manylinux2_28-builder @@ -3196,7 +2862,7 @@ jobs: docker-build-dir: .ci/docker working-directory: pytorch - name: Pull Docker image - uses: pytorch/test-infra/.github/actions/pull-docker-image@main + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.9 with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Test Pytorch binary @@ -3244,7 +2910,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_13t-xpu build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.1.1 | intel-cmplr-lib-ur==2025.1.1 | intel-cmplr-lic-rt==2025.1.1 | intel-sycl-rt==2025.1.1 | oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.1.0 | onemkl-sycl-dft==2025.1.0 | onemkl-sycl-lapack==2025.1.0 | onemkl-sycl-rng==2025.1.0 | onemkl-sycl-sparse==2025.1.0 | dpcpp-cpp-rt==2025.1.1 | intel-opencl-rt==2025.1.1 | mkl==2025.1.0 | intel-openmp==2025.1.1 | tbb==2022.1.0 | tcmlib==1.3.0 | umf==0.10.0 | intel-pti==0.12.3 + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.2.1 | intel-cmplr-lib-ur==2025.2.1 | intel-cmplr-lic-rt==2025.2.1 | intel-sycl-rt==2025.2.1 | oneccl-devel==2021.16.1; platform_system == 'Linux' and platform_machine == 
'x86_64' | oneccl==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.2.0 | onemkl-sycl-dft==2025.2.0 | onemkl-sycl-lapack==2025.2.0 | onemkl-sycl-rng==2025.2.0 | onemkl-sycl-sparse==2025.2.0 | dpcpp-cpp-rt==2025.2.1 | intel-opencl-rt==2025.2.1 | mkl==2025.2.0 | intel-openmp==2025.2.1 | tbb==2022.2.0 | tcmlib==1.4.0 | umf==0.11.0 | intel-pti==0.13.1 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_13t-xpu-test: # Testing @@ -3270,7 +2936,7 @@ jobs: contents: read steps: - name: Setup XPU - uses: ./.github/actions/setup-xpu + uses: pytorch/pytorch/.github/actions/setup-xpu@release/2.9 - name: configure aws credentials id: aws_creds uses: aws-actions/configure-aws-credentials@v4 @@ -3288,7 +2954,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -3299,7 +2964,7 @@ jobs: working-directory: pytorch - name: Calculate docker image id: calculate-docker-image - uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.9 with: docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: manylinux2_28-builder @@ -3307,7 +2972,7 @@ jobs: docker-build-dir: .ci/docker working-directory: pytorch - name: Pull Docker image - uses: pytorch/test-infra/.github/actions/pull-docker-image@main + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.9 with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Test Pytorch binary @@ -3417,7 +3082,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_14-cuda12_6 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; 
platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_14-cuda12_6-test: # Testing @@ -3483,7 +3148,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_14-cuda12_8 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} 
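
The PYTORCH_EXTRA_INSTALL_REQUIREMENTS hunks above and below trim the PEP 508 environment markers on the pinned CUDA wheels from "platform_system == 'Linux' and platform_machine == 'x86_64'" down to "platform_system == 'Linux'", so those extra wheels are selected on any Linux machine rather than only on x86_64. The sketch below is illustrative only and is not part of the generated workflows: it assumes the third-party packaging library, and the applicable_requirements helper and the shortened sample string are hypothetical, but it shows how pip-style marker evaluation treats the old and new markers.

    # Hypothetical helper (not part of this PR): split the pipe-separated
    # PYTORCH_EXTRA_INSTALL_REQUIREMENTS value and report which entries apply
    # to a given platform, using packaging's PEP 508 marker support.
    from packaging.requirements import Requirement

    def applicable_requirements(extra_requirements: str, environment: dict) -> list:
        """Return the requirement strings whose markers match `environment`."""
        selected = []
        for entry in extra_requirements.split("|"):
            req = Requirement(entry.strip())
            # Entries without a marker (e.g. the XPU oneMKL packages) always apply.
            if req.marker is None or req.marker.evaluate(environment=environment):
                selected.append(str(req))
        return selected

    # With the old markers, an aarch64 Linux install would have skipped these
    # wheels because of the extra platform_machine == 'x86_64' clause; with the
    # simplified markers they are selected for any Linux machine.
    sample = "nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux'"
    print(applicable_requirements(sample, {"platform_system": "Linux", "platform_machine": "aarch64"}))
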
manywheel-py3_14-cuda12_8-test: # Testing @@ -3531,72 +3196,6 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - manywheel-py3_14-cuda12_9-build: - if: ${{ github.repository_owner == 'pytorch' }} - uses: ./.github/workflows/_binary-build-linux.yml - needs: get-label-type - with: - PYTORCH_ROOT: /pytorch - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: "12.9" - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 - DESIRED_PYTHON: "3.14" - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build_name: manywheel-py3_14-cuda12_9 - build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64' - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_14-cuda12_9-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: - - manywheel-py3_14-cuda12_9-build - - get-label-type - uses: ./.github/workflows/_binary-test-linux.yml - with: - PYTORCH_ROOT: /pytorch - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: "12.9" - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 - DESIRED_PYTHON: "3.14" - build_name: manywheel-py3_14-cuda12_9 - build_environment: linux-binary-manywheel - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_14-cuda12_9-upload: # Uploading - if: ${{ github.repository_owner == 'pytorch' }} - permissions: - id-token: write - contents: read - needs: manywheel-py3_14-cuda12_9-test - with: - PYTORCH_ROOT: /pytorch - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: "12.9" - GPU_ARCH_TYPE: cuda - 
DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 - DESIRED_PYTHON: "3.14" - build_name: manywheel-py3_14-cuda12_9 - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - uses: ./.github/workflows/_binary-upload.yml - manywheel-py3_14-cuda13_0-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -3615,7 +3214,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_14-cuda13_0 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.12.0.46; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_14-cuda13_0-test: # Testing @@ -3679,6 +3278,7 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: rocm6.3 DESIRED_PYTHON: "3.14" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + timeout-minutes: 300 build_name: manywheel-py3_14-rocm6_3 build_environment: linux-binary-manywheel secrets: @@ -3713,7 +3313,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -3735,7 +3334,7 @@ jobs: role-duration-seconds: 18000 - name: Calculate docker image id: calculate-docker-image - uses: 
pytorch/test-infra/.github/actions/calculate-docker-image@main + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.9 with: docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: manylinux2_28-builder @@ -3743,7 +3342,7 @@ jobs: docker-build-dir: .ci/docker working-directory: pytorch - name: Pull Docker image - uses: pytorch/test-infra/.github/actions/pull-docker-image@main + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.9 with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Test Pytorch binary @@ -3790,6 +3389,7 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: rocm6.4 DESIRED_PYTHON: "3.14" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + timeout-minutes: 300 build_name: manywheel-py3_14-rocm6_4 build_environment: linux-binary-manywheel secrets: @@ -3824,7 +3424,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -3846,7 +3445,7 @@ jobs: role-duration-seconds: 18000 - name: Calculate docker image id: calculate-docker-image - uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.9 with: docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: manylinux2_28-builder @@ -3854,7 +3453,7 @@ jobs: docker-build-dir: .ci/docker working-directory: pytorch - name: Pull Docker image - uses: pytorch/test-infra/.github/actions/pull-docker-image@main + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.9 with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Test Pytorch binary @@ -3902,7 +3501,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_14-xpu build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.1.1 | intel-cmplr-lib-ur==2025.1.1 | intel-cmplr-lic-rt==2025.1.1 | intel-sycl-rt==2025.1.1 | oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.1.0 | onemkl-sycl-dft==2025.1.0 | onemkl-sycl-lapack==2025.1.0 | onemkl-sycl-rng==2025.1.0 | onemkl-sycl-sparse==2025.1.0 | dpcpp-cpp-rt==2025.1.1 | intel-opencl-rt==2025.1.1 | mkl==2025.1.0 | intel-openmp==2025.1.1 | tbb==2022.1.0 | tcmlib==1.3.0 | umf==0.10.0 | intel-pti==0.12.3 + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.2.1 | intel-cmplr-lib-ur==2025.2.1 | intel-cmplr-lic-rt==2025.2.1 | intel-sycl-rt==2025.2.1 | oneccl-devel==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.2.0 | onemkl-sycl-dft==2025.2.0 | onemkl-sycl-lapack==2025.2.0 | onemkl-sycl-rng==2025.2.0 | onemkl-sycl-sparse==2025.2.0 | dpcpp-cpp-rt==2025.2.1 | intel-opencl-rt==2025.2.1 | mkl==2025.2.0 | intel-openmp==2025.2.1 | tbb==2022.2.0 | tcmlib==1.4.0 | 
umf==0.11.0 | intel-pti==0.13.1 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_14-xpu-test: # Testing @@ -3928,7 +3527,7 @@ jobs: contents: read steps: - name: Setup XPU - uses: ./.github/actions/setup-xpu + uses: pytorch/pytorch/.github/actions/setup-xpu@release/2.9 - name: configure aws credentials id: aws_creds uses: aws-actions/configure-aws-credentials@v4 @@ -3946,7 +3545,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -3957,7 +3555,7 @@ jobs: working-directory: pytorch - name: Calculate docker image id: calculate-docker-image - uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.9 with: docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: manylinux2_28-builder @@ -3965,7 +3563,7 @@ jobs: docker-build-dir: .ci/docker working-directory: pytorch - name: Pull Docker image - uses: pytorch/test-infra/.github/actions/pull-docker-image@main + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.9 with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Test Pytorch binary @@ -4075,7 +3673,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_14t-cuda12_6 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | 
nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_14t-cuda12_6-test: # Testing @@ -4141,7 +3739,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_14t-cuda12_8 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_14t-cuda12_8-test: # Testing @@ -4189,72 +3787,6 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - manywheel-py3_14t-cuda12_9-build: - if: ${{ github.repository_owner == 'pytorch' }} - uses: ./.github/workflows/_binary-build-linux.yml - needs: get-label-type - with: - PYTORCH_ROOT: /pytorch - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # 
favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: "12.9" - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 - DESIRED_PYTHON: "3.14t" - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build_name: manywheel-py3_14t-cuda12_9 - build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64' - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_14t-cuda12_9-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: - - manywheel-py3_14t-cuda12_9-build - - get-label-type - uses: ./.github/workflows/_binary-test-linux.yml - with: - PYTORCH_ROOT: /pytorch - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: "12.9" - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 - DESIRED_PYTHON: "3.14t" - build_name: manywheel-py3_14t-cuda12_9 - build_environment: linux-binary-manywheel - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_14t-cuda12_9-upload: # Uploading - if: ${{ github.repository_owner == 'pytorch' }} - permissions: - id-token: write - contents: read - needs: manywheel-py3_14t-cuda12_9-test - with: - PYTORCH_ROOT: /pytorch - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: "12.9" - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 - DESIRED_PYTHON: "3.14t" - build_name: manywheel-py3_14t-cuda12_9 - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - uses: ./.github/workflows/_binary-upload.yml - manywheel-py3_14t-cuda13_0-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -4273,7 +3805,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type 
}}" build_name: manywheel-py3_14t-cuda13_0 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.12.0.46; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_14t-cuda13_0-test: # Testing @@ -4337,6 +3869,7 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: rocm6.3 DESIRED_PYTHON: "3.14t" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + timeout-minutes: 300 build_name: manywheel-py3_14t-rocm6_3 build_environment: linux-binary-manywheel secrets: @@ -4371,7 +3904,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -4393,7 +3925,7 @@ jobs: role-duration-seconds: 18000 - name: Calculate docker image id: calculate-docker-image - uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.9 with: docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: manylinux2_28-builder @@ -4401,7 +3933,7 @@ jobs: docker-build-dir: .ci/docker working-directory: pytorch - name: Pull Docker image - uses: 
pytorch/test-infra/.github/actions/pull-docker-image@main + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.9 with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Test Pytorch binary @@ -4448,6 +3980,7 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: rocm6.4 DESIRED_PYTHON: "3.14t" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + timeout-minutes: 300 build_name: manywheel-py3_14t-rocm6_4 build_environment: linux-binary-manywheel secrets: @@ -4482,7 +4015,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -4504,7 +4036,7 @@ jobs: role-duration-seconds: 18000 - name: Calculate docker image id: calculate-docker-image - uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.9 with: docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: manylinux2_28-builder @@ -4512,7 +4044,7 @@ jobs: docker-build-dir: .ci/docker working-directory: pytorch - name: Pull Docker image - uses: pytorch/test-infra/.github/actions/pull-docker-image@main + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.9 with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Test Pytorch binary @@ -4560,7 +4092,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_14t-xpu build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.1.1 | intel-cmplr-lib-ur==2025.1.1 | intel-cmplr-lic-rt==2025.1.1 | intel-sycl-rt==2025.1.1 | oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.1.0 | onemkl-sycl-dft==2025.1.0 | onemkl-sycl-lapack==2025.1.0 | onemkl-sycl-rng==2025.1.0 | onemkl-sycl-sparse==2025.1.0 | dpcpp-cpp-rt==2025.1.1 | intel-opencl-rt==2025.1.1 | mkl==2025.1.0 | intel-openmp==2025.1.1 | tbb==2022.1.0 | tcmlib==1.3.0 | umf==0.10.0 | intel-pti==0.12.3 + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.2.1 | intel-cmplr-lib-ur==2025.2.1 | intel-cmplr-lic-rt==2025.2.1 | intel-sycl-rt==2025.2.1 | oneccl-devel==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.2.0 | onemkl-sycl-dft==2025.2.0 | onemkl-sycl-lapack==2025.2.0 | onemkl-sycl-rng==2025.2.0 | onemkl-sycl-sparse==2025.2.0 | dpcpp-cpp-rt==2025.2.1 | intel-opencl-rt==2025.2.1 | mkl==2025.2.0 | intel-openmp==2025.2.1 | tbb==2022.2.0 | tcmlib==1.4.0 | umf==0.11.0 | intel-pti==0.13.1 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_14t-xpu-test: # Testing @@ -4586,7 +4118,7 @@ jobs: contents: read steps: - name: Setup XPU - uses: ./.github/actions/setup-xpu + uses: pytorch/pytorch/.github/actions/setup-xpu@release/2.9 - name: configure aws credentials id: aws_creds uses: aws-actions/configure-aws-credentials@v4 @@ -4604,7 +4136,6 @@ jobs: - name: Checkout PyTorch uses: 
actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -4615,7 +4146,7 @@ jobs: working-directory: pytorch - name: Calculate docker image id: calculate-docker-image - uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.9 with: docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: manylinux2_28-builder @@ -4623,7 +4154,7 @@ jobs: docker-build-dir: .ci/docker working-directory: pytorch - name: Pull Docker image - uses: pytorch/test-infra/.github/actions/pull-docker-image@main + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.9 with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Test Pytorch binary diff --git a/.github/workflows/generated-linux-binary-manywheel-rocm-main.yml b/.github/workflows/generated-linux-binary-manywheel-rocm-main.yml index 8177bac3fe21..9df4835757c4 100644 --- a/.github/workflows/generated-linux-binary-manywheel-rocm-main.yml +++ b/.github/workflows/generated-linux-binary-manywheel-rocm-main.yml @@ -38,13 +38,13 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.9 with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} curr_branch: ${{ github.head_ref || github.ref_name }} curr_ref_type: ${{ github.ref_type }} - manywheel-py3_9-rocm6_4-build: + manywheel-py3_10-rocm6_4-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml needs: get-label-type @@ -58,16 +58,17 @@ jobs: GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.4 - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build_name: manywheel-py3_9-rocm6_4 + timeout-minutes: 300 + build_name: manywheel-py3_10-rocm6_4 build_environment: linux-binary-manywheel-rocm secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_9-rocm6_4-test: # Testing + manywheel-py3_10-rocm6_4-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - manywheel-py3_9-rocm6_4-build + - manywheel-py3_10-rocm6_4-build - get-label-type runs-on: linux.rocm.gpu.mi250 timeout-minutes: 240 @@ -82,19 +83,18 @@ jobs: SKIP_ALL_TESTS: 1 DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.4 - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" steps: - name: Setup ROCm uses: ./.github/actions/setup-rocm - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: manywheel-py3_9-rocm6_4 + name: manywheel-py3_10-rocm6_4 path: "${{ runner.temp }}/artifacts/" - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -116,7 +116,7 @@ jobs: role-duration-seconds: 18000 - name: Calculate docker image id: calculate-docker-image - uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + uses: 
pytorch/test-infra/.github/actions/calculate-docker-image@release/2.9 with: docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: manylinux2_28-builder @@ -124,7 +124,7 @@ jobs: docker-build-dir: .ci/docker working-directory: pytorch - name: Pull Docker image - uses: pytorch/test-infra/.github/actions/pull-docker-image@main + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.9 with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Test Pytorch binary diff --git a/.github/workflows/generated-linux-s390x-binary-manywheel-nightly.yml b/.github/workflows/generated-linux-s390x-binary-manywheel-nightly.yml index b0c3c06b2e61..d7fd44031be2 100644 --- a/.github/workflows/generated-linux-s390x-binary-manywheel-nightly.yml +++ b/.github/workflows/generated-linux-s390x-binary-manywheel-nightly.yml @@ -41,7 +41,7 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.9 with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -302,3 +302,195 @@ jobs: secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml + + manywheel-py3_13t-cpu-s390x-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu-s390x + DOCKER_IMAGE: pytorch/manylinuxs390x-builder + DOCKER_IMAGE_TAG_PREFIX: cpu-s390x + DESIRED_PYTHON: "3.13t" + runs_on: linux.s390x + ALPINE_IMAGE: "docker.io/s390x/alpine" + timeout-minutes: 420 + build_name: manywheel-py3_13t-cpu-s390x + build_environment: linux-s390x-binary-manywheel + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_13t-cpu-s390x-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - manywheel-py3_13t-cpu-s390x-build + - get-label-type + uses: ./.github/workflows/_binary-test-linux.yml + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu-s390x + DOCKER_IMAGE: pytorch/manylinuxs390x-builder + DOCKER_IMAGE_TAG_PREFIX: cpu-s390x + DESIRED_PYTHON: "3.13t" + build_name: manywheel-py3_13t-cpu-s390x + build_environment: linux-s390x-binary-manywheel + runs_on: linux.s390x + ALPINE_IMAGE: "docker.io/s390x/alpine" + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_13t-cpu-s390x-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_13t-cpu-s390x-test + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu-s390x + DOCKER_IMAGE: pytorch/manylinuxs390x-builder + DOCKER_IMAGE_TAG_PREFIX: cpu-s390x + DESIRED_PYTHON: "3.13t" + build_name: manywheel-py3_13t-cpu-s390x + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} 
+ uses: ./.github/workflows/_binary-upload.yml + + manywheel-py3_14-cpu-s390x-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu-s390x + DOCKER_IMAGE: pytorch/manylinuxs390x-builder + DOCKER_IMAGE_TAG_PREFIX: cpu-s390x + DESIRED_PYTHON: "3.14" + runs_on: linux.s390x + ALPINE_IMAGE: "docker.io/s390x/alpine" + timeout-minutes: 420 + build_name: manywheel-py3_14-cpu-s390x + build_environment: linux-s390x-binary-manywheel + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_14-cpu-s390x-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - manywheel-py3_14-cpu-s390x-build + - get-label-type + uses: ./.github/workflows/_binary-test-linux.yml + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu-s390x + DOCKER_IMAGE: pytorch/manylinuxs390x-builder + DOCKER_IMAGE_TAG_PREFIX: cpu-s390x + DESIRED_PYTHON: "3.14" + build_name: manywheel-py3_14-cpu-s390x + build_environment: linux-s390x-binary-manywheel + runs_on: linux.s390x + ALPINE_IMAGE: "docker.io/s390x/alpine" + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_14-cpu-s390x-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_14-cpu-s390x-test + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu-s390x + DOCKER_IMAGE: pytorch/manylinuxs390x-builder + DOCKER_IMAGE_TAG_PREFIX: cpu-s390x + DESIRED_PYTHON: "3.14" + build_name: manywheel-py3_14-cpu-s390x + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + + manywheel-py3_14t-cpu-s390x-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu-s390x + DOCKER_IMAGE: pytorch/manylinuxs390x-builder + DOCKER_IMAGE_TAG_PREFIX: cpu-s390x + DESIRED_PYTHON: "3.14t" + runs_on: linux.s390x + ALPINE_IMAGE: "docker.io/s390x/alpine" + timeout-minutes: 420 + build_name: manywheel-py3_14t-cpu-s390x + build_environment: linux-s390x-binary-manywheel + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_14t-cpu-s390x-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - manywheel-py3_14t-cpu-s390x-build + - get-label-type + uses: ./.github/workflows/_binary-test-linux.yml + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu-s390x + DOCKER_IMAGE: pytorch/manylinuxs390x-builder + DOCKER_IMAGE_TAG_PREFIX: cpu-s390x + DESIRED_PYTHON: "3.14t" + build_name: manywheel-py3_14t-cpu-s390x + build_environment: linux-s390x-binary-manywheel + runs_on: linux.s390x + ALPINE_IMAGE: 
"docker.io/s390x/alpine" + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_14t-cpu-s390x-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_14t-cpu-s390x-test + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu-s390x + DOCKER_IMAGE: pytorch/manylinuxs390x-builder + DOCKER_IMAGE_TAG_PREFIX: cpu-s390x + DESIRED_PYTHON: "3.14t" + build_name: manywheel-py3_14t-cpu-s390x + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml diff --git a/.github/workflows/generated-macos-arm64-binary-libtorch-release-nightly.yml b/.github/workflows/generated-macos-arm64-binary-libtorch-release-nightly.yml index ad7a1cf1d71d..5f21fc565901 100644 --- a/.github/workflows/generated-macos-arm64-binary-libtorch-release-nightly.yml +++ b/.github/workflows/generated-macos-arm64-binary-libtorch-release-nightly.yml @@ -46,7 +46,7 @@ jobs: LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -67,15 +67,9 @@ jobs: chmod +x "${RUNNER_TEMP}/conda.sh" /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - if [ -d "/Applications/Xcode_14.3.1.app" ]; then - echo "DEVELOPER_DIR=/Applications/Xcode_14.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" - elif [ -d "/Applications/Xcode_13.3.1.app" ]; then - echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" - fi - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false diff --git a/.github/workflows/generated-macos-arm64-binary-wheel-nightly.yml b/.github/workflows/generated-macos-arm64-binary-wheel-nightly.yml index bcc7279dd777..b12a5212cd4e 100644 --- a/.github/workflows/generated-macos-arm64-binary-wheel-nightly.yml +++ b/.github/workflows/generated-macos-arm64-binary-wheel-nightly.yml @@ -63,15 +63,9 @@ jobs: chmod +x "${RUNNER_TEMP}/conda.sh" /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - if [ -d "/Applications/Xcode_14.3.1.app" ]; then - echo "DEVELOPER_DIR=/Applications/Xcode_14.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" - elif [ -d "/Applications/Xcode_13.3.1.app" ]; then - echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" - fi - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -208,15 +202,9 @@ jobs: chmod +x "${RUNNER_TEMP}/conda.sh" /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - if [ -d "/Applications/Xcode_14.3.1.app" ]; then - echo 
"DEVELOPER_DIR=/Applications/Xcode_14.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" - elif [ -d "/Applications/Xcode_13.3.1.app" ]; then - echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" - fi - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -353,15 +341,9 @@ jobs: chmod +x "${RUNNER_TEMP}/conda.sh" /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - if [ -d "/Applications/Xcode_14.3.1.app" ]; then - echo "DEVELOPER_DIR=/Applications/Xcode_14.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" - elif [ -d "/Applications/Xcode_13.3.1.app" ]; then - echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" - fi - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -498,15 +480,9 @@ jobs: chmod +x "${RUNNER_TEMP}/conda.sh" /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - if [ -d "/Applications/Xcode_14.3.1.app" ]; then - echo "DEVELOPER_DIR=/Applications/Xcode_14.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" - elif [ -d "/Applications/Xcode_13.3.1.app" ]; then - echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" - fi - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -643,15 +619,9 @@ jobs: chmod +x "${RUNNER_TEMP}/conda.sh" /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - if [ -d "/Applications/Xcode_14.3.1.app" ]; then - echo "DEVELOPER_DIR=/Applications/Xcode_14.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" - elif [ -d "/Applications/Xcode_13.3.1.app" ]; then - echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" - fi - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -788,15 +758,9 @@ jobs: chmod +x "${RUNNER_TEMP}/conda.sh" /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - if [ -d "/Applications/Xcode_14.3.1.app" ]; then - echo "DEVELOPER_DIR=/Applications/Xcode_14.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" - elif [ -d "/Applications/Xcode_13.3.1.app" ]; then - echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" - fi - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -933,15 +897,9 @@ jobs: chmod +x "${RUNNER_TEMP}/conda.sh" /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - if [ -d "/Applications/Xcode_14.3.1.app" ]; then - echo "DEVELOPER_DIR=/Applications/Xcode_14.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" 
- elif [ -d "/Applications/Xcode_13.3.1.app" ]; then - echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" - fi - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false diff --git a/.github/workflows/generated-windows-arm64-binary-libtorch-debug-nightly.yml b/.github/workflows/generated-windows-arm64-binary-libtorch-debug-nightly.yml index 2c86e7e10359..7a8ea9cbfa2c 100644 --- a/.github/workflows/generated-windows-arm64-binary-libtorch-debug-nightly.yml +++ b/.github/workflows/generated-windows-arm64-binary-libtorch-debug-nightly.yml @@ -41,7 +41,7 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.9 with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -51,7 +51,7 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "windows-11-arm64-preview" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch @@ -64,7 +64,7 @@ jobs: LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" steps: - name: Populate binary env shell: cmd @@ -128,7 +128,7 @@ jobs: - libtorch-cpu-shared-with-deps-debug-build - get-label-type runs-on: "windows-11-arm64-preview" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch @@ -141,7 +141,7 @@ jobs: LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" steps: - name: Populate binary env shell: cmd @@ -201,7 +201,7 @@ jobs: LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" build_name: libtorch-cpu-shared-with-deps-debug secrets: github-token: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/generated-windows-arm64-binary-libtorch-release-nightly.yml b/.github/workflows/generated-windows-arm64-binary-libtorch-release-nightly.yml index 912a452f0ee8..14081649d370 100644 --- a/.github/workflows/generated-windows-arm64-binary-libtorch-release-nightly.yml +++ b/.github/workflows/generated-windows-arm64-binary-libtorch-release-nightly.yml @@ -41,7 +41,7 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.9 with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -51,7 +51,7 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "windows-11-arm64-preview" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ 
github.workspace }}/pytorch PACKAGE_TYPE: libtorch @@ -64,7 +64,7 @@ jobs: LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" steps: - name: Populate binary env shell: cmd @@ -128,7 +128,7 @@ jobs: - libtorch-cpu-shared-with-deps-release-build - get-label-type runs-on: "windows-11-arm64-preview" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch @@ -141,7 +141,7 @@ jobs: LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" steps: - name: Populate binary env shell: cmd @@ -201,7 +201,7 @@ jobs: LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" build_name: libtorch-cpu-shared-with-deps-release secrets: github-token: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/generated-windows-arm64-binary-wheel-nightly.yml b/.github/workflows/generated-windows-arm64-binary-wheel-nightly.yml index 1dd70d0d06a9..d0e02dade299 100644 --- a/.github/workflows/generated-windows-arm64-binary-wheel-nightly.yml +++ b/.github/workflows/generated-windows-arm64-binary-wheel-nightly.yml @@ -41,7 +41,7 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.9 with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -51,7 +51,7 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "windows-11-arm64-preview" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -124,7 +124,7 @@ jobs: - wheel-py3_11-cpu-build - get-label-type runs-on: "windows-11-arm64-preview" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -198,7 +198,7 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "windows-11-arm64-preview" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -271,7 +271,7 @@ jobs: - wheel-py3_12-cpu-build - get-label-type runs-on: "windows-11-arm64-preview" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -345,7 +345,7 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "windows-11-arm64-preview" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -418,7 +418,7 @@ jobs: - wheel-py3_13-cpu-build - get-label-type runs-on: "windows-11-arm64-preview" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel diff --git a/.github/workflows/generated-windows-binary-libtorch-debug-main.yml b/.github/workflows/generated-windows-binary-libtorch-debug-main.yml index 
ac15a9f3e97a..3df2c65440a5 100644 --- a/.github/workflows/generated-windows-binary-libtorch-debug-main.yml +++ b/.github/workflows/generated-windows-binary-libtorch-debug-main.yml @@ -28,7 +28,7 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.9 with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -38,7 +38,7 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch @@ -51,7 +51,7 @@ jobs: LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -77,7 +77,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -109,7 +109,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -153,7 +152,7 @@ jobs: - libtorch-cpu-shared-with-deps-debug-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch @@ -166,7 +165,7 @@ jobs: LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" steps: - name: Display EC2 information shell: bash @@ -183,7 +182,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -215,7 +214,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false diff --git a/.github/workflows/generated-windows-binary-libtorch-debug-nightly.yml b/.github/workflows/generated-windows-binary-libtorch-debug-nightly.yml index 9c3a96d4caee..f4413a86c657 100644 --- a/.github/workflows/generated-windows-binary-libtorch-debug-nightly.yml +++ b/.github/workflows/generated-windows-binary-libtorch-debug-nightly.yml @@ -35,7 +35,7 @@ jobs: get-label-type: if: 
github.repository_owner == 'pytorch' name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.9 with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -45,7 +45,7 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch @@ -58,7 +58,7 @@ jobs: LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -84,7 +84,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -116,7 +116,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -160,7 +159,7 @@ jobs: - libtorch-cpu-shared-with-deps-debug-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch @@ -173,7 +172,7 @@ jobs: LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" steps: - name: Display EC2 information shell: bash @@ -190,7 +189,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -222,7 +221,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -283,7 +281,7 @@ jobs: LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" build_name: libtorch-cpu-shared-with-deps-debug secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -292,7 +290,7 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch @@ -306,7 
+304,7 @@ jobs: LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -332,7 +330,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -364,7 +362,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -408,7 +405,7 @@ jobs: - libtorch-cuda12_6-shared-with-deps-debug-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch @@ -422,7 +419,7 @@ jobs: LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" steps: - name: Display EC2 information shell: bash @@ -439,7 +436,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -471,7 +468,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -533,7 +529,7 @@ jobs: LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" build_name: libtorch-cuda12_6-shared-with-deps-debug secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -542,7 +538,7 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch @@ -556,7 +552,7 @@ jobs: LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -582,7 +578,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: 
pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -614,7 +610,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -658,7 +653,7 @@ jobs: - libtorch-cuda12_8-shared-with-deps-debug-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch @@ -672,7 +667,7 @@ jobs: LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" steps: - name: Display EC2 information shell: bash @@ -689,7 +684,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -721,7 +716,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -783,30 +777,30 @@ jobs: LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" build_name: libtorch-cuda12_8-shared-with-deps-debug secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - libtorch-cuda12_9-shared-with-deps-debug-build: + libtorch-cuda13_0-shared-with-deps-debug-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: "12.9" + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 LIBTORCH_CONFIG: debug LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -832,7 +826,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -864,7 +858,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: 
${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -884,7 +877,7 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: - name: libtorch-cuda12_9-shared-with-deps-debug + name: libtorch-cuda13_0-shared-with-deps-debug retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -902,27 +895,27 @@ jobs: run: | .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cuda12_9-shared-with-deps-debug-test: # Testing + libtorch-cuda13_0-shared-with-deps-debug-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - libtorch-cuda12_9-shared-with-deps-debug-build + - libtorch-cuda13_0-shared-with-deps-debug-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: "12.9" + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 LIBTORCH_CONFIG: debug LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" steps: - name: Display EC2 information shell: bash @@ -939,7 +932,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -971,7 +964,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -992,7 +984,7 @@ jobs: - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: libtorch-cuda12_9-shared-with-deps-debug + name: libtorch-cuda13_0-shared-with-deps-debug path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Populate binary env shell: bash @@ -1015,26 +1007,26 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cuda12_9-shared-with-deps-debug-upload: # Uploading + libtorch-cuda13_0-shared-with-deps-debug-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: libtorch-cuda12_9-shared-with-deps-debug-test + needs: libtorch-cuda13_0-shared-with-deps-debug-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: "12.9" + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda LIBTORCH_CONFIG: debug LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" - build_name: libtorch-cuda12_9-shared-with-deps-debug + DESIRED_PYTHON: "3.10" + build_name: libtorch-cuda13_0-shared-with-deps-debug secrets: github-token: ${{ 
secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml diff --git a/.github/workflows/generated-windows-binary-libtorch-release-main.yml b/.github/workflows/generated-windows-binary-libtorch-release-main.yml index 9a0a3496e37b..ef94d6212af3 100644 --- a/.github/workflows/generated-windows-binary-libtorch-release-main.yml +++ b/.github/workflows/generated-windows-binary-libtorch-release-main.yml @@ -28,7 +28,7 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.9 with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -38,7 +38,7 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch @@ -51,7 +51,7 @@ jobs: LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -77,7 +77,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -109,7 +109,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -153,7 +152,7 @@ jobs: - libtorch-cpu-shared-with-deps-release-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch @@ -166,7 +165,7 @@ jobs: LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" steps: - name: Display EC2 information shell: bash @@ -183,7 +182,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -215,7 +214,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false diff --git a/.github/workflows/generated-windows-binary-libtorch-release-nightly.yml b/.github/workflows/generated-windows-binary-libtorch-release-nightly.yml index 
d212894b7443..8f4ec6e0b205 100644 --- a/.github/workflows/generated-windows-binary-libtorch-release-nightly.yml +++ b/.github/workflows/generated-windows-binary-libtorch-release-nightly.yml @@ -35,7 +35,7 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.9 with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -45,7 +45,7 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch @@ -58,7 +58,7 @@ jobs: LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -84,7 +84,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -116,7 +116,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -160,7 +159,7 @@ jobs: - libtorch-cpu-shared-with-deps-release-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch @@ -173,7 +172,7 @@ jobs: LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" steps: - name: Display EC2 information shell: bash @@ -190,7 +189,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -222,7 +221,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -283,7 +281,7 @@ jobs: LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" build_name: libtorch-cpu-shared-with-deps-release secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -292,7 +290,7 @@ jobs: if: ${{ github.repository_owner == 
'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch @@ -306,7 +304,7 @@ jobs: LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -332,7 +330,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -364,7 +362,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -408,7 +405,7 @@ jobs: - libtorch-cuda12_6-shared-with-deps-release-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch @@ -422,7 +419,7 @@ jobs: LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" steps: - name: Display EC2 information shell: bash @@ -439,7 +436,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -471,7 +468,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -533,7 +529,7 @@ jobs: LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" build_name: libtorch-cuda12_6-shared-with-deps-release secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -542,7 +538,7 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch @@ -556,7 +552,7 @@ jobs: LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a 
workflow level doesn't give us access to the @@ -582,7 +578,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -614,7 +610,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -658,7 +653,7 @@ jobs: - libtorch-cuda12_8-shared-with-deps-release-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch @@ -672,7 +667,7 @@ jobs: LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" steps: - name: Display EC2 information shell: bash @@ -689,7 +684,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -721,7 +716,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -783,30 +777,30 @@ jobs: LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" build_name: libtorch-cuda12_8-shared-with-deps-release secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - libtorch-cuda12_9-shared-with-deps-release-build: + libtorch-cuda13_0-shared-with-deps-release-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: "12.9" + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 LIBTORCH_CONFIG: release LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -832,7 +826,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: 
pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -864,7 +858,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -884,7 +877,7 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: - name: libtorch-cuda12_9-shared-with-deps-release + name: libtorch-cuda13_0-shared-with-deps-release retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -902,27 +895,27 @@ jobs: run: | .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cuda12_9-shared-with-deps-release-test: # Testing + libtorch-cuda13_0-shared-with-deps-release-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - libtorch-cuda12_9-shared-with-deps-release-build + - libtorch-cuda13_0-shared-with-deps-release-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: "12.9" + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 LIBTORCH_CONFIG: release LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" steps: - name: Display EC2 information shell: bash @@ -939,7 +932,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -971,7 +964,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -992,7 +984,7 @@ jobs: - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: libtorch-cuda12_9-shared-with-deps-release + name: libtorch-cuda13_0-shared-with-deps-release path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Populate binary env shell: bash @@ -1015,26 +1007,26 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cuda12_9-shared-with-deps-release-upload: # Uploading + libtorch-cuda13_0-shared-with-deps-release-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: libtorch-cuda12_9-shared-with-deps-release-test + needs: libtorch-cuda13_0-shared-with-deps-release-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: "12.9" + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda LIBTORCH_CONFIG: release LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch 
to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" - build_name: libtorch-cuda12_9-shared-with-deps-release + DESIRED_PYTHON: "3.10" + build_name: libtorch-cuda13_0-shared-with-deps-release secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml diff --git a/.github/workflows/generated-windows-binary-wheel-nightly.yml b/.github/workflows/generated-windows-binary-wheel-nightly.yml index b476973a1d86..bca8d4843463 100644 --- a/.github/workflows/generated-windows-binary-wheel-nightly.yml +++ b/.github/workflows/generated-windows-binary-wheel-nightly.yml @@ -35,7 +35,7 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.9 with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -45,7 +45,7 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -80,7 +80,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -112,7 +112,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -156,7 +155,7 @@ jobs: - wheel-py3_10-cpu-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -182,7 +181,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -214,7 +213,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -280,7 +278,7 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -316,7 +314,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -348,7 +346,6 @@ jobs: - name: Checkout 
PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -392,7 +389,7 @@ jobs: - wheel-py3_10-cuda12_6-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -419,7 +416,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -451,7 +448,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -518,7 +514,7 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -554,7 +550,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -586,7 +582,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -630,7 +625,7 @@ jobs: - wheel-py3_10-cuda12_8-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -657,7 +652,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -689,7 +684,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -752,18 +746,18 @@ jobs: secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_10-cuda12_9-build: + wheel-py3_10-cuda13_0-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: "12.9" + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 
DESIRED_PYTHON: "3.10" @@ -792,7 +786,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -824,7 +818,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -844,7 +837,7 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: - name: wheel-py3_10-cuda12_9 + name: wheel-py3_10-cuda13_0 retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -862,20 +855,20 @@ jobs: run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_10-cuda12_9-test: # Testing + wheel-py3_10-cuda13_0-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - wheel-py3_10-cuda12_9-build + - wheel-py3_10-cuda13_0-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: "12.9" + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.10" @@ -895,7 +888,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -927,7 +920,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -948,7 +940,7 @@ jobs: - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: wheel-py3_10-cuda12_9 + name: wheel-py3_10-cuda13_0 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Populate binary env shell: bash @@ -971,22 +963,22 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_10-cuda12_9-upload: # Uploading + wheel-py3_10-cuda13_0-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: wheel-py3_10-cuda12_9-test + needs: wheel-py3_10-cuda13_0-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: "12.9" + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda DESIRED_PYTHON: "3.10" - build_name: wheel-py3_10-cuda12_9 + build_name: wheel-py3_10-cuda13_0 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml @@ -994,7 +986,7 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: 
${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -1004,7 +996,7 @@ jobs: GPU_ARCH_TYPE: xpu SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.10" - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.1.1 | intel-cmplr-lib-ur==2025.1.1 | intel-cmplr-lic-rt==2025.1.1 | intel-sycl-rt==2025.1.1 | oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.1.0 | onemkl-sycl-dft==2025.1.0 | onemkl-sycl-lapack==2025.1.0 | onemkl-sycl-rng==2025.1.0 | onemkl-sycl-sparse==2025.1.0 | dpcpp-cpp-rt==2025.1.1 | intel-opencl-rt==2025.1.1 | mkl==2025.1.0 | intel-openmp==2025.1.1 | tbb==2022.1.0 | tcmlib==1.3.0 | umf==0.10.0 | intel-pti==0.12.3 + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.2.1 | intel-cmplr-lib-ur==2025.2.1 | intel-cmplr-lic-rt==2025.2.1 | intel-sycl-rt==2025.2.1 | oneccl-devel==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.2.0 | onemkl-sycl-dft==2025.2.0 | onemkl-sycl-lapack==2025.2.0 | onemkl-sycl-rng==2025.2.0 | onemkl-sycl-sparse==2025.2.0 | dpcpp-cpp-rt==2025.2.1 | intel-opencl-rt==2025.2.1 | mkl==2025.2.0 | intel-openmp==2025.2.1 | tbb==2022.2.0 | tcmlib==1.4.0 | umf==0.11.0 | intel-pti==0.13.1 steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -1030,7 +1022,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -1062,7 +1054,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -1106,7 +1097,7 @@ jobs: - wheel-py3_10-xpu-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -1132,7 +1123,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -1164,7 +1155,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -1230,7 +1220,7 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ 
-1265,7 +1255,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -1297,7 +1287,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -1341,7 +1330,7 @@ jobs: - wheel-py3_11-cpu-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -1367,7 +1356,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -1399,7 +1388,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -1465,7 +1453,7 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -1501,7 +1489,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -1533,7 +1521,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -1577,7 +1564,7 @@ jobs: - wheel-py3_11-cuda12_6-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -1604,7 +1591,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -1636,7 +1623,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -1703,7 +1689,7 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace 
}}/pytorch PACKAGE_TYPE: wheel @@ -1739,7 +1725,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -1771,7 +1757,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -1815,7 +1800,7 @@ jobs: - wheel-py3_11-cuda12_8-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -1842,7 +1827,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -1874,7 +1859,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -1937,18 +1921,18 @@ jobs: secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_11-cuda12_9-build: + wheel-py3_11-cuda13_0-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: "12.9" + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.11" @@ -1977,7 +1961,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -2009,7 +1993,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -2029,7 +2012,7 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: - name: wheel-py3_11-cuda12_9 + name: wheel-py3_11-cuda13_0 retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -2047,20 +2030,20 @@ jobs: run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_11-cuda12_9-test: # Testing + wheel-py3_11-cuda13_0-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - wheel-py3_11-cuda12_9-build + - wheel-py3_11-cuda13_0-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ 
github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: "12.9" + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.11" @@ -2080,7 +2063,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -2112,7 +2095,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -2133,7 +2115,7 @@ jobs: - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: wheel-py3_11-cuda12_9 + name: wheel-py3_11-cuda13_0 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Populate binary env shell: bash @@ -2156,22 +2138,22 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_11-cuda12_9-upload: # Uploading + wheel-py3_11-cuda13_0-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: wheel-py3_11-cuda12_9-test + needs: wheel-py3_11-cuda13_0-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: "12.9" + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda DESIRED_PYTHON: "3.11" - build_name: wheel-py3_11-cuda12_9 + build_name: wheel-py3_11-cuda13_0 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml @@ -2179,7 +2161,7 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -2189,7 +2171,7 @@ jobs: GPU_ARCH_TYPE: xpu SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.11" - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.1.1 | intel-cmplr-lib-ur==2025.1.1 | intel-cmplr-lic-rt==2025.1.1 | intel-sycl-rt==2025.1.1 | oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.1.0 | onemkl-sycl-dft==2025.1.0 | onemkl-sycl-lapack==2025.1.0 | onemkl-sycl-rng==2025.1.0 | onemkl-sycl-sparse==2025.1.0 | dpcpp-cpp-rt==2025.1.1 | intel-opencl-rt==2025.1.1 | mkl==2025.1.0 | intel-openmp==2025.1.1 | tbb==2022.1.0 | tcmlib==1.3.0 | umf==0.10.0 | intel-pti==0.12.3 + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.2.1 | intel-cmplr-lib-ur==2025.2.1 | intel-cmplr-lic-rt==2025.2.1 | intel-sycl-rt==2025.2.1 | oneccl-devel==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.2.0 | 
onemkl-sycl-dft==2025.2.0 | onemkl-sycl-lapack==2025.2.0 | onemkl-sycl-rng==2025.2.0 | onemkl-sycl-sparse==2025.2.0 | dpcpp-cpp-rt==2025.2.1 | intel-opencl-rt==2025.2.1 | mkl==2025.2.0 | intel-openmp==2025.2.1 | tbb==2022.2.0 | tcmlib==1.4.0 | umf==0.11.0 | intel-pti==0.13.1 steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -2215,7 +2197,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -2247,7 +2229,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -2291,7 +2272,7 @@ jobs: - wheel-py3_11-xpu-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -2317,7 +2298,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -2349,7 +2330,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -2415,7 +2395,7 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -2450,7 +2430,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -2482,7 +2462,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -2526,7 +2505,7 @@ jobs: - wheel-py3_12-cpu-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -2552,7 +2531,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -2584,7 +2563,6 @@ jobs: - name: 
Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -2650,7 +2628,7 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -2686,7 +2664,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -2718,7 +2696,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -2762,7 +2739,7 @@ jobs: - wheel-py3_12-cuda12_6-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -2789,7 +2766,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -2821,7 +2798,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -2888,7 +2864,7 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -2924,7 +2900,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -2956,7 +2932,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -3000,7 +2975,7 @@ jobs: - wheel-py3_12-cuda12_8-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -3027,7 +3002,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN 
}} @@ -3059,7 +3034,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -3122,18 +3096,18 @@ jobs: secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_12-cuda12_9-build: + wheel-py3_12-cuda13_0-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: "12.9" + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.12" @@ -3162,7 +3136,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -3194,7 +3168,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -3214,7 +3187,7 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: - name: wheel-py3_12-cuda12_9 + name: wheel-py3_12-cuda13_0 retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -3232,20 +3205,20 @@ jobs: run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_12-cuda12_9-test: # Testing + wheel-py3_12-cuda13_0-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - wheel-py3_12-cuda12_9-build + - wheel-py3_12-cuda13_0-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: "12.9" + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.12" @@ -3265,7 +3238,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -3297,7 +3270,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -3318,7 +3290,7 @@ jobs: - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: wheel-py3_12-cuda12_9 + name: wheel-py3_12-cuda13_0 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Populate binary env shell: bash @@ -3341,22 +3313,22 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - 
wheel-py3_12-cuda12_9-upload: # Uploading + wheel-py3_12-cuda13_0-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: wheel-py3_12-cuda12_9-test + needs: wheel-py3_12-cuda13_0-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: "12.9" + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda DESIRED_PYTHON: "3.12" - build_name: wheel-py3_12-cuda12_9 + build_name: wheel-py3_12-cuda13_0 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml @@ -3364,7 +3336,7 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -3374,7 +3346,7 @@ jobs: GPU_ARCH_TYPE: xpu SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.12" - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.1.1 | intel-cmplr-lib-ur==2025.1.1 | intel-cmplr-lic-rt==2025.1.1 | intel-sycl-rt==2025.1.1 | oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.1.0 | onemkl-sycl-dft==2025.1.0 | onemkl-sycl-lapack==2025.1.0 | onemkl-sycl-rng==2025.1.0 | onemkl-sycl-sparse==2025.1.0 | dpcpp-cpp-rt==2025.1.1 | intel-opencl-rt==2025.1.1 | mkl==2025.1.0 | intel-openmp==2025.1.1 | tbb==2022.1.0 | tcmlib==1.3.0 | umf==0.10.0 | intel-pti==0.12.3 + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.2.1 | intel-cmplr-lib-ur==2025.2.1 | intel-cmplr-lic-rt==2025.2.1 | intel-sycl-rt==2025.2.1 | oneccl-devel==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.2.0 | onemkl-sycl-dft==2025.2.0 | onemkl-sycl-lapack==2025.2.0 | onemkl-sycl-rng==2025.2.0 | onemkl-sycl-sparse==2025.2.0 | dpcpp-cpp-rt==2025.2.1 | intel-opencl-rt==2025.2.1 | mkl==2025.2.0 | intel-openmp==2025.2.1 | tbb==2022.2.0 | tcmlib==1.4.0 | umf==0.11.0 | intel-pti==0.13.1 steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -3400,7 +3372,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -3432,7 +3404,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -3476,7 +3447,7 @@ jobs: - wheel-py3_12-xpu-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: 
PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -3502,7 +3473,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -3534,7 +3505,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -3600,7 +3570,7 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -3635,7 +3605,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -3667,7 +3637,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -3711,7 +3680,7 @@ jobs: - wheel-py3_13-cpu-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -3737,7 +3706,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -3769,7 +3738,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -3835,7 +3803,7 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -3871,7 +3839,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -3903,7 +3871,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -3947,7 +3914,7 @@ jobs: - wheel-py3_13-cuda12_6-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" - timeout-minutes: 
300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -3974,7 +3941,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -4006,7 +3973,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -4073,7 +4039,7 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -4109,7 +4075,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -4141,7 +4107,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -4185,7 +4150,7 @@ jobs: - wheel-py3_13-cuda12_8-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -4212,7 +4177,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -4244,7 +4209,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -4307,18 +4271,18 @@ jobs: secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_13-cuda12_9-build: + wheel-py3_13-cuda13_0-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: "12.9" + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.13" @@ -4347,7 +4311,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true 
with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -4379,7 +4343,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -4399,7 +4362,7 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: - name: wheel-py3_13-cuda12_9 + name: wheel-py3_13-cuda13_0 retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -4417,20 +4380,20 @@ jobs: run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_13-cuda12_9-test: # Testing + wheel-py3_13-cuda13_0-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - wheel-py3_13-cuda12_9-build + - wheel-py3_13-cuda13_0-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: "12.9" + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.13" @@ -4450,7 +4413,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -4482,7 +4445,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -4503,7 +4465,7 @@ jobs: - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: wheel-py3_13-cuda12_9 + name: wheel-py3_13-cuda13_0 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Populate binary env shell: bash @@ -4526,22 +4488,22 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_13-cuda12_9-upload: # Uploading + wheel-py3_13-cuda13_0-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: wheel-py3_13-cuda12_9-test + needs: wheel-py3_13-cuda13_0-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: "12.9" + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda DESIRED_PYTHON: "3.13" - build_name: wheel-py3_13-cuda12_9 + build_name: wheel-py3_13-cuda13_0 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml @@ -4549,7 +4511,7 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -4559,7 +4521,7 @@ jobs: GPU_ARCH_TYPE: xpu SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.13" - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.1.1 | intel-cmplr-lib-ur==2025.1.1 | intel-cmplr-lic-rt==2025.1.1 | intel-sycl-rt==2025.1.1 | oneccl-devel==2021.15.2; 
platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.1.0 | onemkl-sycl-dft==2025.1.0 | onemkl-sycl-lapack==2025.1.0 | onemkl-sycl-rng==2025.1.0 | onemkl-sycl-sparse==2025.1.0 | dpcpp-cpp-rt==2025.1.1 | intel-opencl-rt==2025.1.1 | mkl==2025.1.0 | intel-openmp==2025.1.1 | tbb==2022.1.0 | tcmlib==1.3.0 | umf==0.10.0 | intel-pti==0.12.3 + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.2.1 | intel-cmplr-lib-ur==2025.2.1 | intel-cmplr-lic-rt==2025.2.1 | intel-sycl-rt==2025.2.1 | oneccl-devel==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.2.0 | onemkl-sycl-dft==2025.2.0 | onemkl-sycl-lapack==2025.2.0 | onemkl-sycl-rng==2025.2.0 | onemkl-sycl-sparse==2025.2.0 | dpcpp-cpp-rt==2025.2.1 | intel-opencl-rt==2025.2.1 | mkl==2025.2.0 | intel-openmp==2025.2.1 | tbb==2022.2.0 | tcmlib==1.4.0 | umf==0.11.0 | intel-pti==0.13.1 steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -4585,7 +4547,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -4617,7 +4579,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -4661,7 +4622,7 @@ jobs: - wheel-py3_13-xpu-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -4687,7 +4648,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -4719,7 +4680,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -4785,7 +4745,7 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -4820,7 +4780,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true 
with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -4852,7 +4812,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -4896,7 +4855,7 @@ jobs: - wheel-py3_13t-cpu-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -4922,7 +4881,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -4954,7 +4913,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -5020,7 +4978,7 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -5056,7 +5014,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -5088,7 +5046,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -5132,7 +5089,7 @@ jobs: - wheel-py3_13t-cuda12_6-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -5159,7 +5116,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -5191,7 +5148,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -5258,7 +5214,7 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -5294,7 +5250,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: 
pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -5326,7 +5282,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -5370,7 +5325,7 @@ jobs: - wheel-py3_13t-cuda12_8-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -5397,7 +5352,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -5429,7 +5384,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -5492,18 +5446,18 @@ jobs: secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_13t-cuda12_9-build: + wheel-py3_13t-cuda13_0-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: "12.9" + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.13t" @@ -5532,7 +5486,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -5564,7 +5518,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -5584,7 +5537,7 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: - name: wheel-py3_13t-cuda12_9 + name: wheel-py3_13t-cuda13_0 retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -5602,20 +5555,20 @@ jobs: run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_13t-cuda12_9-test: # Testing + wheel-py3_13t-cuda13_0-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - wheel-py3_13t-cuda12_9-build + - wheel-py3_13t-cuda13_0-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: "12.9" + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda 
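The hunk above renames the CUDA 12.9 jobs to CUDA 13.0 and keeps several spellings of the same version in sync: the legacy DESIRED_CUDA string (cu130) that the TODO comment wants to retire, GPU_ARCH_VERSION ("13.0"), and the cuda13_0 suffix used in the surrounding job and artifact names such as wheel-py3_13t-cuda13_0. The Python sketch below only illustrates how those spellings relate to one another; the helper names are hypothetical and are not part of PyTorch's build scripts.

```python
# Hypothetical helpers, for illustration only: they show how the identifiers in
# this diff relate (GPU_ARCH_VERSION "13.0" -> DESIRED_CUDA "cu130", and the
# "cuda13_0" suffix used in job/artifact names). Not part of the PyTorch repo.

def desired_cuda(gpu_arch_version: str) -> str:
    """'13.0' -> 'cu130' (the legacy DESIRED_CUDA spelling)."""
    return "cu" + gpu_arch_version.replace(".", "")

def build_name(package_type: str, desired_python: str, gpu_arch_version: str) -> str:
    """('wheel', '3.13t', '13.0') -> 'wheel-py3_13t-cuda13_0'."""
    py_tag = "py" + desired_python.replace(".", "_")
    cuda_tag = "cuda" + gpu_arch_version.replace(".", "_")
    return f"{package_type}-{py_tag}-{cuda_tag}"

assert desired_cuda("13.0") == "cu130"
assert build_name("wheel", "3.13t", "13.0") == "wheel-py3_13t-cuda13_0"
```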
SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.13t" @@ -5635,7 +5588,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -5667,7 +5620,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -5688,7 +5640,7 @@ jobs: - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: wheel-py3_13t-cuda12_9 + name: wheel-py3_13t-cuda13_0 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Populate binary env shell: bash @@ -5711,22 +5663,22 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_13t-cuda12_9-upload: # Uploading + wheel-py3_13t-cuda13_0-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: wheel-py3_13t-cuda12_9-test + needs: wheel-py3_13t-cuda13_0-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: "12.9" + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda DESIRED_PYTHON: "3.13t" - build_name: wheel-py3_13t-cuda12_9 + build_name: wheel-py3_13t-cuda13_0 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml @@ -5734,7 +5686,7 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -5744,7 +5696,7 @@ jobs: GPU_ARCH_TYPE: xpu SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.13t" - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.1.1 | intel-cmplr-lib-ur==2025.1.1 | intel-cmplr-lic-rt==2025.1.1 | intel-sycl-rt==2025.1.1 | oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.1.0 | onemkl-sycl-dft==2025.1.0 | onemkl-sycl-lapack==2025.1.0 | onemkl-sycl-rng==2025.1.0 | onemkl-sycl-sparse==2025.1.0 | dpcpp-cpp-rt==2025.1.1 | intel-opencl-rt==2025.1.1 | mkl==2025.1.0 | intel-openmp==2025.1.1 | tbb==2022.1.0 | tcmlib==1.3.0 | umf==0.10.0 | intel-pti==0.12.3 + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.2.1 | intel-cmplr-lib-ur==2025.2.1 | intel-cmplr-lic-rt==2025.2.1 | intel-sycl-rt==2025.2.1 | oneccl-devel==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.2.0 | onemkl-sycl-dft==2025.2.0 | onemkl-sycl-lapack==2025.2.0 | onemkl-sycl-rng==2025.2.0 | onemkl-sycl-sparse==2025.2.0 | dpcpp-cpp-rt==2025.2.1 | intel-opencl-rt==2025.2.1 | mkl==2025.2.0 | intel-openmp==2025.2.1 | tbb==2022.2.0 | tcmlib==1.4.0 | umf==0.11.0 | intel-pti==0.13.1 
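The bumped PYTORCH_EXTRA_INSTALL_REQUIREMENTS value above is a single string of pip requirement specifiers joined with "|", and the oneCCL/IMPI pins carry PEP 508 environment markers (; platform_system == 'Linux' and platform_machine == 'x86_64') so they only apply on Linux x86_64. A minimal Python sketch, assuming the string is simply split on that separator, shows how such markers evaluate with the standard packaging library; how the PyTorch binary-build scripts actually consume this variable is not reproduced here.

```python
# Minimal sketch: split a PYTORCH_EXTRA_INSTALL_REQUIREMENTS-style string into
# pip requirement specifiers and evaluate their PEP 508 markers.
# Assumption: the "|" separator seen in this workflow is the only delimiter;
# the real consumer of this variable in the build scripts may behave differently.
from packaging.requirements import Requirement

extra = (
    "intel-cmplr-lib-rt==2025.2.1 | "
    "oneccl==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | "
    "mkl==2025.2.0"
)

for spec in (s.strip() for s in extra.split("|")):
    req = Requirement(spec)
    applies = req.marker.evaluate() if req.marker else True
    print(f"{req.name}{req.specifier}  ->  install on this platform: {applies}")
```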
steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -5770,7 +5722,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -5802,7 +5754,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -5846,7 +5797,7 @@ jobs: - wheel-py3_13t-xpu-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -5872,7 +5823,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -5904,7 +5855,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -5970,7 +5920,7 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -6005,7 +5955,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -6037,7 +5987,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -6081,7 +6030,7 @@ jobs: - wheel-py3_14-cpu-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -6107,7 +6056,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -6139,7 +6088,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -6205,7 +6153,7 @@ jobs: if: ${{ github.repository_owner == 
'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -6241,7 +6189,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -6273,7 +6221,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -6317,7 +6264,7 @@ jobs: - wheel-py3_14-cuda12_6-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -6344,7 +6291,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -6376,7 +6323,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -6443,7 +6389,7 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -6479,7 +6425,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -6511,7 +6457,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -6555,7 +6500,7 @@ jobs: - wheel-py3_14-cuda12_8-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -6582,7 +6527,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -6614,7 +6559,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -6677,18 +6621,18 @@ 
jobs: secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_14-cuda12_9-build: + wheel-py3_14-cuda13_0-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: "12.9" + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.14" @@ -6717,7 +6661,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -6749,7 +6693,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -6769,7 +6712,7 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: - name: wheel-py3_14-cuda12_9 + name: wheel-py3_14-cuda13_0 retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -6787,20 +6730,20 @@ jobs: run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_14-cuda12_9-test: # Testing + wheel-py3_14-cuda13_0-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - wheel-py3_14-cuda12_9-build + - wheel-py3_14-cuda13_0-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: "12.9" + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.14" @@ -6820,7 +6763,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -6852,7 +6795,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -6873,7 +6815,7 @@ jobs: - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: wheel-py3_14-cuda12_9 + name: wheel-py3_14-cuda13_0 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Populate binary env shell: bash @@ -6896,22 +6838,22 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_14-cuda12_9-upload: # Uploading + wheel-py3_14-cuda13_0-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: wheel-py3_14-cuda12_9-test + needs: wheel-py3_14-cuda13_0-test with: PYTORCH_ROOT: ${{ github.workspace 
}}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: "12.9" + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda DESIRED_PYTHON: "3.14" - build_name: wheel-py3_14-cuda12_9 + build_name: wheel-py3_14-cuda13_0 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml @@ -6919,7 +6861,7 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -6929,7 +6871,7 @@ jobs: GPU_ARCH_TYPE: xpu SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.14" - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.1.1 | intel-cmplr-lib-ur==2025.1.1 | intel-cmplr-lic-rt==2025.1.1 | intel-sycl-rt==2025.1.1 | oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.1.0 | onemkl-sycl-dft==2025.1.0 | onemkl-sycl-lapack==2025.1.0 | onemkl-sycl-rng==2025.1.0 | onemkl-sycl-sparse==2025.1.0 | dpcpp-cpp-rt==2025.1.1 | intel-opencl-rt==2025.1.1 | mkl==2025.1.0 | intel-openmp==2025.1.1 | tbb==2022.1.0 | tcmlib==1.3.0 | umf==0.10.0 | intel-pti==0.12.3 + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.2.1 | intel-cmplr-lib-ur==2025.2.1 | intel-cmplr-lic-rt==2025.2.1 | intel-sycl-rt==2025.2.1 | oneccl-devel==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.2.0 | onemkl-sycl-dft==2025.2.0 | onemkl-sycl-lapack==2025.2.0 | onemkl-sycl-rng==2025.2.0 | onemkl-sycl-sparse==2025.2.0 | dpcpp-cpp-rt==2025.2.1 | intel-opencl-rt==2025.2.1 | mkl==2025.2.0 | intel-openmp==2025.2.1 | tbb==2022.2.0 | tcmlib==1.4.0 | umf==0.11.0 | intel-pti==0.13.1 steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -6955,7 +6897,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -6987,7 +6929,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -7031,7 +6972,7 @@ jobs: - wheel-py3_14-xpu-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -7057,7 +6998,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: 
pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -7089,7 +7030,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -7155,7 +7095,7 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -7190,7 +7130,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -7222,7 +7162,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -7266,7 +7205,7 @@ jobs: - wheel-py3_14t-cpu-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -7292,7 +7231,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -7324,7 +7263,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -7390,7 +7328,7 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -7426,7 +7364,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -7458,7 +7396,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -7502,7 +7439,7 @@ jobs: - wheel-py3_14t-cuda12_6-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -7529,7 +7466,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH 
(Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -7561,7 +7498,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -7628,7 +7564,7 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -7664,7 +7600,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -7696,7 +7632,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -7740,7 +7675,7 @@ jobs: - wheel-py3_14t-cuda12_8-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -7767,7 +7702,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -7799,7 +7734,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -7862,18 +7796,18 @@ jobs: secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_14t-cuda12_9-build: + wheel-py3_14t-cuda13_0-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: "12.9" + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.14t" @@ -7902,7 +7836,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -7934,7 +7868,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} 
submodules: recursive path: pytorch show-progress: false @@ -7954,7 +7887,7 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: - name: wheel-py3_14t-cuda12_9 + name: wheel-py3_14t-cuda13_0 retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -7972,20 +7905,20 @@ jobs: run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_14t-cuda12_9-test: # Testing + wheel-py3_14t-cuda13_0-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - wheel-py3_14t-cuda12_9-build + - wheel-py3_14t-cuda13_0-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: "12.9" + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.14t" @@ -8005,7 +7938,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -8037,7 +7970,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -8058,7 +7990,7 @@ jobs: - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: wheel-py3_14t-cuda12_9 + name: wheel-py3_14t-cuda13_0 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Populate binary env shell: bash @@ -8081,22 +8013,22 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_14t-cuda12_9-upload: # Uploading + wheel-py3_14t-cuda13_0-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: wheel-py3_14t-cuda12_9-test + needs: wheel-py3_14t-cuda13_0-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: "12.9" + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda DESIRED_PYTHON: "3.14t" - build_name: wheel-py3_14t-cuda12_9 + build_name: wheel-py3_14t-cuda13_0 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml @@ -8104,7 +8036,7 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -8114,7 +8046,7 @@ jobs: GPU_ARCH_TYPE: xpu SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.14t" - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.1.1 | intel-cmplr-lib-ur==2025.1.1 | intel-cmplr-lic-rt==2025.1.1 | intel-sycl-rt==2025.1.1 | oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | 
onemkl-sycl-blas==2025.1.0 | onemkl-sycl-dft==2025.1.0 | onemkl-sycl-lapack==2025.1.0 | onemkl-sycl-rng==2025.1.0 | onemkl-sycl-sparse==2025.1.0 | dpcpp-cpp-rt==2025.1.1 | intel-opencl-rt==2025.1.1 | mkl==2025.1.0 | intel-openmp==2025.1.1 | tbb==2022.1.0 | tcmlib==1.3.0 | umf==0.10.0 | intel-pti==0.12.3 + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.2.1 | intel-cmplr-lib-ur==2025.2.1 | intel-cmplr-lic-rt==2025.2.1 | intel-sycl-rt==2025.2.1 | oneccl-devel==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.2.0 | onemkl-sycl-dft==2025.2.0 | onemkl-sycl-lapack==2025.2.0 | onemkl-sycl-rng==2025.2.0 | onemkl-sycl-sparse==2025.2.0 | dpcpp-cpp-rt==2025.2.1 | intel-opencl-rt==2025.2.1 | mkl==2025.2.0 | intel-openmp==2025.2.1 | tbb==2022.2.0 | tcmlib==1.4.0 | umf==0.11.0 | intel-pti==0.13.1 steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -8140,7 +8072,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -8172,7 +8104,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -8216,7 +8147,7 @@ jobs: - wheel-py3_14t-xpu-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -8242,7 +8173,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -8274,7 +8205,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false diff --git a/.github/workflows/h100-cutlass-backend.yml b/.github/workflows/h100-cutlass-backend.yml index edf4c2e0e807..6eb072399242 100644 --- a/.github/workflows/h100-cutlass-backend.yml +++ b/.github/workflows/h100-cutlass-backend.yml @@ -27,7 +27,7 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.9 with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} diff --git a/.github/workflows/h100-distributed.yml b/.github/workflows/h100-distributed.yml index a0a7495483d4..8996add88383 100644 --- a/.github/workflows/h100-distributed.yml +++ 
b/.github/workflows/h100-distributed.yml @@ -24,7 +24,7 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.9 with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} diff --git a/.github/workflows/h100-symm-mem.yml b/.github/workflows/h100-symm-mem.yml index c75ca569fc7d..fa8a795216f3 100644 --- a/.github/workflows/h100-symm-mem.yml +++ b/.github/workflows/h100-symm-mem.yml @@ -24,7 +24,7 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.9 with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} diff --git a/.github/workflows/inductor-micro-benchmark-x86.yml b/.github/workflows/inductor-micro-benchmark-x86.yml index 117183428abc..c6cc075e6b27 100644 --- a/.github/workflows/inductor-micro-benchmark-x86.yml +++ b/.github/workflows/inductor-micro-benchmark-x86.yml @@ -18,13 +18,13 @@ permissions: contents: read jobs: - linux-jammy-cpu-py3_9-gcc11-inductor-build: + inductor-build: if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} - name: linux-jammy-cpu-py3.9-gcc11-inductor + name: inductor-build uses: ./.github/workflows/_linux-build.yml with: build-environment: linux-jammy-py3.9-gcc11 - docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks + docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks # Use metal host for benchmark jobs test-matrix: | { include: [ @@ -32,13 +32,13 @@ jobs: ]} secrets: inherit - linux-jammy-cpu-py3_9-gcc11-inductor-micro-benchmark-test: - name: linux-jammy-cpu-py3.9-gcc11-inductor + inductor-micro-benchmark-test: + name: inductor-micro-benchmark-test uses: ./.github/workflows/_linux-test.yml - needs: linux-jammy-cpu-py3_9-gcc11-inductor-build + needs: inductor-build with: build-environment: linux-jammy-py3.9-gcc11 - docker-image: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.test-matrix }} + docker-image: ${{ needs.inductor-build.outputs.docker-image }} + test-matrix: ${{ needs.inductor-build.outputs.test-matrix }} timeout-minutes: 720 secrets: inherit diff --git a/.github/workflows/inductor-micro-benchmark.yml b/.github/workflows/inductor-micro-benchmark.yml index a0ae234ab566..842094e0eb48 100644 --- a/.github/workflows/inductor-micro-benchmark.yml +++ b/.github/workflows/inductor-micro-benchmark.yml @@ -20,7 +20,7 @@ permissions: jobs: get-default-label-prefix: name: get-default-label-prefix - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.9 if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} diff --git a/.github/workflows/inductor-nightly.yml b/.github/workflows/inductor-nightly.yml index c17a4ed6341a..7502381de93d 100644 --- a/.github/workflows/inductor-nightly.yml +++ 
b/.github/workflows/inductor-nightly.yml @@ -23,7 +23,7 @@ permissions: jobs: get-default-label-prefix: name: get-default-label-prefix - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.9 if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} @@ -32,13 +32,13 @@ jobs: curr_ref_type: ${{ github.ref_type }} opt_out_experiments: lf - linux-jammy-cpu-py3_9-gcc11-nightly-dynamo-benchmarks-build: - name: linux-jammy-cpu-py3.9-gcc11-nightly-dynamo-benchmarks + nightly-dynamo-benchmarks-build: + name: nightly-dynamo-benchmarks-build uses: ./.github/workflows/_linux-build.yml needs: get-default-label-prefix with: - build-environment: linux-jammy-py3.9-gcc11-build - docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks + build-environment: linux-jammy-py3.10-gcc11-build + docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}" test-matrix: | { include: [ @@ -51,13 +51,13 @@ jobs: build-additional-packages: "vision audio torchao" secrets: inherit - linux-jammy-cpu-py3_9-gcc11-nightly-dynamo-benchmarks-test: - name: linux-jammy-cpu-py3.9-gcc11-nightly-dynamo-benchmarks + nightly-dynamo-benchmarks-test: + name: nightly-dynamo-benchmarks-test uses: ./.github/workflows/_linux-test.yml - needs: linux-jammy-cpu-py3_9-gcc11-nightly-dynamo-benchmarks-build + needs: nightly-dynamo-benchmarks-build with: - build-environment: linux-jammy-py3.9-gcc11-build - docker-image: ${{ needs.linux-jammy-cpu-py3_9-gcc11-nightly-dynamo-benchmarks-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-jammy-cpu-py3_9-gcc11-nightly-dynamo-benchmarks-build.outputs.test-matrix }} + build-environment: linux-jammy-py3.10-gcc11-build + docker-image: ${{ needs.nightly-dynamo-benchmarks-build.outputs.docker-image }} + test-matrix: ${{ needs.nightly-dynamo-benchmarks-build.outputs.test-matrix }} timeout-minutes: 720 secrets: inherit diff --git a/.github/workflows/inductor-perf-compare.yml b/.github/workflows/inductor-perf-compare.yml index 628f62424012..35217f72bf1a 100644 --- a/.github/workflows/inductor-perf-compare.yml +++ b/.github/workflows/inductor-perf-compare.yml @@ -18,7 +18,7 @@ jobs: get-default-label-prefix: if: github.repository_owner == 'pytorch' name: get-default-label-prefix - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.9 with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} diff --git a/.github/workflows/inductor-perf-test-b200.yml b/.github/workflows/inductor-perf-test-b200.yml index 7b59e92386a3..3c648a849f78 100644 --- a/.github/workflows/inductor-perf-test-b200.yml +++ b/.github/workflows/inductor-perf-test-b200.yml @@ -70,7 +70,7 @@ permissions: jobs: get-label-type: name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.9 if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} diff --git 
a/.github/workflows/inductor-perf-test-nightly-aarch64.yml b/.github/workflows/inductor-perf-test-nightly-aarch64.yml index e16c8be79130..9e3165fe11ea 100644 --- a/.github/workflows/inductor-perf-test-nightly-aarch64.yml +++ b/.github/workflows/inductor-perf-test-nightly-aarch64.yml @@ -55,7 +55,7 @@ permissions: jobs: get-label-type: name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.9 if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} diff --git a/.github/workflows/inductor-perf-test-nightly-h100.yml b/.github/workflows/inductor-perf-test-nightly-h100.yml index dfaec8240d6c..7e323fa5a92e 100644 --- a/.github/workflows/inductor-perf-test-nightly-h100.yml +++ b/.github/workflows/inductor-perf-test-nightly-h100.yml @@ -75,7 +75,7 @@ permissions: jobs: get-label-type: name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.9 if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} @@ -84,9 +84,8 @@ jobs: curr_ref_type: ${{ github.ref_type }} opt_out_experiments: lf - # NB: Keep this in sync with trunk.yml build: - name: cuda12.8-py3.10-gcc9-sm90 + name: build uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: @@ -128,7 +127,7 @@ jobs: secrets: inherit test-periodically: - name: cuda12.8-py3.10-gcc9-sm90 + name: test-periodically uses: ./.github/workflows/_linux-test.yml needs: build if: github.event.schedule == '15 0,12 * * 1-6' @@ -145,7 +144,7 @@ jobs: secrets: inherit test-weekly: - name: cuda12.8-py3.10-gcc9-sm90 + name: test-weekly uses: ./.github/workflows/_linux-test.yml needs: build if: github.event.schedule == '0 7 * * 0' @@ -162,9 +161,12 @@ jobs: secrets: inherit test: - name: cuda12.8-py3.10-gcc9-sm90 + name: test uses: ./.github/workflows/_linux-test.yml needs: build + # The pull_request trigger is used in PR to bump transformers pin which always + # needs one round of benchmark + if: ${{ github.event_name == 'workflow_dispatch' || github.event_name == 'pull_request' }} with: build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm90 dashboard-tag: training-${{ inputs.training || 'true' }}-inference-${{ inputs.inference || 'true' }}-default-${{ inputs.default || 'true' }}-dynamic-${{ inputs.dynamic || 'true' }}-cudagraphs-${{ inputs.cudagraphs || 'true' }}-cppwrapper-${{ inputs.cppwrapper || 'false' }}-aotinductor-${{ inputs.aotinductor || 'false' }}-maxautotune-${{ inputs.maxautotune || 'false' }}-freezing_cudagraphs-${{ inputs.freezing_cudagraphs || 'false' }}-cudagraphs_low_precision-${{ inputs.cudagraphs || 'false' }} diff --git a/.github/workflows/inductor-perf-test-nightly-macos.yml b/.github/workflows/inductor-perf-test-nightly-macos.yml index 0d92455a8f3c..c3b9a4229924 100644 --- a/.github/workflows/inductor-perf-test-nightly-macos.yml +++ b/.github/workflows/inductor-perf-test-nightly-macos.yml @@ -48,6 +48,9 @@ jobs: { config: "perf_smoketest", shard: 1, num_shards: 3, runner: "macos-m2-15" }, { config: "perf_smoketest", shard: 2, num_shards: 3, runner: "macos-m2-15" }, { config: "perf_smoketest", shard: 3, num_shards: 3, runner: "macos-m2-15" }, + { config: 
"aot_inductor_perf_smoketest", shard: 1, num_shards: 3, runner: "macos-m2-15" }, + { config: "aot_inductor_perf_smoketest", shard: 2, num_shards: 3, runner: "macos-m2-15" }, + { config: "aot_inductor_perf_smoketest", shard: 3, num_shards: 3, runner: "macos-m2-15" }, ]} secrets: inherit diff --git a/.github/workflows/inductor-perf-test-nightly-rocm.yml b/.github/workflows/inductor-perf-test-nightly-rocm.yml index f329fe74e6b6..dddf68091fdb 100644 --- a/.github/workflows/inductor-perf-test-nightly-rocm.yml +++ b/.github/workflows/inductor-perf-test-nightly-rocm.yml @@ -70,7 +70,7 @@ permissions: read-all jobs: get-label-type: name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.9 if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} diff --git a/.github/workflows/inductor-perf-test-nightly-x86-zen.yml b/.github/workflows/inductor-perf-test-nightly-x86-zen.yml index 6e19130a1924..8057b1042676 100644 --- a/.github/workflows/inductor-perf-test-nightly-x86-zen.yml +++ b/.github/workflows/inductor-perf-test-nightly-x86-zen.yml @@ -60,7 +60,7 @@ permissions: jobs: get-label-type: name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.9 if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} @@ -69,14 +69,14 @@ jobs: curr_ref_type: ${{ github.ref_type }} opt_out_experiments: lf - linux-jammy-zen-cpu-py3_9-gcc11-inductor-build: - name: linux-jammy-zen-cpu-py3.9-gcc11-inductor + inductor-build: + name: inductor-build uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build-environment: linux-jammy-py3.9-gcc11-build - docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks + build-environment: linux-jammy-py3.10-gcc11-build + docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks test-matrix: | { include: [ { config: "inductor_huggingface_perf_cpu_x86_zen", shard: 1, num_shards: 3, runner: "linux.24xlarge.amd" }, @@ -95,16 +95,16 @@ jobs: selected-test-configs: ${{ inputs.benchmark_configs }} secrets: inherit - linux-jammy-zen-cpu-py3_9-gcc11-inductor-test-nightly: - name: linux-jammy-zen-cpu-py3.9-gcc11-inductor + inductor-test-nightly: + name: inductor-test-nightly uses: ./.github/workflows/_linux-test.yml - needs: linux-jammy-zen-cpu-py3_9-gcc11-inductor-build + needs: inductor-build if: github.event.schedule == '0 7 * * *' with: - build-environment: linux-jammy-py3.9-gcc11-build + build-environment: linux-jammy-py3.10-gcc11-build dashboard-tag: training-false-inference-true-default-true-dynamic-true-cppwrapper-true-aotinductor-true - docker-image: ${{ needs.linux-jammy-zen-cpu-py3_9-gcc11-inductor-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-jammy-zen-cpu-py3_9-gcc11-inductor-build.outputs.test-matrix }} + docker-image: ${{ needs.inductor-build.outputs.docker-image }} + test-matrix: ${{ needs.inductor-build.outputs.test-matrix }} timeout-minutes: 720 # disable monitor in perf tests disable-monitor: false @@ -112,17 +112,16 @@ jobs: monitor-data-collect-interval: 4 secrets: 
inherit - - linux-jammy-zen-cpu-py3_9-gcc11-inductor-test: - name: linux-jammy-zen-cpu-py3.9-gcc11-inductor + inductor-test: + name: inductor-test uses: ./.github/workflows/_linux-test.yml - needs: linux-jammy-zen-cpu-py3_9-gcc11-inductor-build + needs: inductor-build if: github.event_name == 'workflow_dispatch' with: - build-environment: linux-jammy-py3.9-gcc11-build + build-environment: linux-jammy-py3.10-gcc11-build dashboard-tag: training-${{ inputs.training }}-inference-${{ inputs.inference }}-default-${{ inputs.default }}-dynamic-${{ inputs.dynamic }}-cppwrapper-${{ inputs.cppwrapper }}-aotinductor-${{ inputs.aotinductor }} - docker-image: ${{ needs.linux-jammy-zen-cpu-py3_9-gcc11-inductor-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-jammy-zen-cpu-py3_9-gcc11-inductor-build.outputs.test-matrix }} + docker-image: ${{ needs.inductor-build.outputs.docker-image }} + test-matrix: ${{ needs.inductor-build.outputs.test-matrix }} timeout-minutes: 720 # disable monitor in perf tests disable-monitor: false diff --git a/.github/workflows/inductor-perf-test-nightly-x86.yml b/.github/workflows/inductor-perf-test-nightly-x86.yml index 62234e5f499a..b68e9ad95ca4 100644 --- a/.github/workflows/inductor-perf-test-nightly-x86.yml +++ b/.github/workflows/inductor-perf-test-nightly-x86.yml @@ -65,7 +65,7 @@ permissions: jobs: get-label-type: name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.9 if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} @@ -74,14 +74,14 @@ jobs: curr_ref_type: ${{ github.ref_type }} opt_out_experiments: lf - linux-jammy-cpu-py3_9-gcc11-inductor-build: - name: linux-jammy-cpu-py3.9-gcc11-inductor + inductor-build: + name: inductor-build uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build-environment: linux-jammy-py3.9-gcc11-build - docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks + build-environment: linux-jammy-py3.10-gcc11-build + docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks test-matrix: | { include: [ { config: "inductor_huggingface_perf_cpu_x86", shard: 1, num_shards: 3, runner: "linux.24xl.spr-metal" }, @@ -101,16 +101,16 @@ jobs: build-additional-packages: "vision audio torchao" secrets: inherit - linux-jammy-cpu-py3_9-gcc11-inductor-test-nightly-freezing: - name: linux-jammy-cpu-py3.9-gcc11-inductor + inductor-test-nightly-freezing: + name: inductor-test-nightly-freezing uses: ./.github/workflows/_linux-test.yml - needs: linux-jammy-cpu-py3_9-gcc11-inductor-build + needs: inductor-build if: github.event.schedule == '0 7 * * *' with: - build-environment: linux-jammy-py3.9-gcc11-build + build-environment: linux-jammy-py3.10-gcc11-build dashboard-tag: training-false-inference-true-default-true-dynamic-true-cppwrapper-true-aotinductor-true-freezing-true - docker-image: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.test-matrix }} + docker-image: ${{ needs.inductor-build.outputs.docker-image }} + test-matrix: ${{ needs.inductor-build.outputs.test-matrix }} timeout-minutes: 720 # disable monitor in perf tests disable-monitor: false @@ -118,16 +118,16 @@ jobs: 
monitor-data-collect-interval: 4 secrets: inherit - linux-jammy-cpu-py3_9-gcc11-inductor-test: - name: linux-jammy-cpu-py3.9-gcc11-inductor + inductor-test: + name: inductor-test uses: ./.github/workflows/_linux-test.yml - needs: linux-jammy-cpu-py3_9-gcc11-inductor-build + needs: inductor-build if: github.event_name == 'workflow_dispatch' with: - build-environment: linux-jammy-py3.9-gcc11-build + build-environment: linux-jammy-py3.10-gcc11-build dashboard-tag: training-${{ inputs.training }}-inference-${{ inputs.inference }}-default-${{ inputs.default }}-dynamic-${{ inputs.dynamic }}-cppwrapper-${{ inputs.cppwrapper }}-aotinductor-${{ inputs.aotinductor }}-freezing-${{ inputs.freezing }} - docker-image: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.test-matrix }} + docker-image: ${{ needs.inductor-build.outputs.docker-image }} + test-matrix: ${{ needs.inductor-build.outputs.test-matrix }} timeout-minutes: 720 # disable monitor in perf tests disable-monitor: false diff --git a/.github/workflows/inductor-perf-test-nightly.yml b/.github/workflows/inductor-perf-test-nightly.yml index 9fd81a5a05c9..7c573d4d2571 100644 --- a/.github/workflows/inductor-perf-test-nightly.yml +++ b/.github/workflows/inductor-perf-test-nightly.yml @@ -70,7 +70,7 @@ permissions: jobs: get-label-type: name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.9 if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} @@ -79,7 +79,6 @@ jobs: curr_ref_type: ${{ github.ref_type }} opt_out_experiments: lf - # NB: Keep this in sync with trunk.yml build: name: cuda12.8-py3.10-gcc9-sm80 uses: ./.github/workflows/_linux-build.yml diff --git a/.github/workflows/inductor-periodic.yml b/.github/workflows/inductor-periodic.yml index 436cf95c156d..b17ebb84d5d3 100644 --- a/.github/workflows/inductor-periodic.yml +++ b/.github/workflows/inductor-periodic.yml @@ -22,7 +22,7 @@ permissions: jobs: get-default-label-prefix: name: get-default-label-prefix - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.9 if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} @@ -31,8 +31,8 @@ jobs: curr_ref_type: ${{ github.ref_type }} opt_out_experiments: lf - linux-jammy-cuda12_8-py3_10-gcc9-periodic-dynamo-benchmarks-build: - name: cuda12.8-py3.10-gcc9-sm86-periodic-dynamo-benchmarks + periodic-dynamo-benchmarks-build: + name: periodic-dynamo-benchmarks-build uses: ./.github/workflows/_linux-build.yml needs: get-default-label-prefix with: @@ -57,23 +57,33 @@ jobs: { config: "dynamic_aot_eager_huggingface", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" }, { config: "dynamic_aot_eager_timm", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, { config: "dynamic_aot_eager_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, + { config: "dynamic_inductor_huggingface", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" }, + { config: "dynamic_inductor_timm", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, + { 
config: "dynamic_inductor_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, + { config: "dynamic_inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, + { config: "dynamic_inductor_torchbench", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, + { config: "aot_inductor_huggingface", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" }, + { config: "aot_inductor_timm", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, + { config: "aot_inductor_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, + { config: "aot_inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, + { config: "aot_inductor_torchbench", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, ]} build-additional-packages: "vision audio fbgemm torchao" secrets: inherit - linux-jammy-cuda12_8-py3_10-gcc9-periodic-dynamo-benchmarks-test: - name: cuda12.8-py3.10-gcc9-sm86-periodic-dynamo-benchmarks + periodic-dynamo-benchmarks-test: + name: periodic-dynamo-benchmarks-test uses: ./.github/workflows/_linux-test.yml - needs: linux-jammy-cuda12_8-py3_10-gcc9-periodic-dynamo-benchmarks-build + needs: periodic-dynamo-benchmarks-build with: build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm86 - docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-periodic-dynamo-benchmarks-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-periodic-dynamo-benchmarks-build.outputs.test-matrix }} + docker-image: ${{ needs.periodic-dynamo-benchmarks-build.outputs.docker-image }} + test-matrix: ${{ needs.periodic-dynamo-benchmarks-build.outputs.test-matrix }} secrets: inherit - linux-jammy-rocm-py3_10-periodic-dynamo-benchmarks-build: + rocm-periodic-dynamo-benchmarks-build: if: github.repository_owner == 'pytorch' - name: rocm-py3_10-periodic-dynamo-benchmarks + name: rocm-periodic-dynamo-benchmarks-build uses: ./.github/workflows/_linux-build.yml with: build-environment: linux-jammy-rocm-py3_10 @@ -99,21 +109,21 @@ jobs: ]} secrets: inherit - linux-jammy-rocm-py3_10-periodic-dynamo-benchmarks-test: + rocm-periodic-dynamo-benchmarks-test: permissions: id-token: write contents: read - name: rocm-py3_10-periodic-dynamo-benchmarks + name: rocm-periodic-dynamo-benchmarks-test uses: ./.github/workflows/_rocm-test.yml - needs: linux-jammy-rocm-py3_10-periodic-dynamo-benchmarks-build + needs: rocm-periodic-dynamo-benchmarks-build with: build-environment: linux-jammy-rocm-py3_10 - docker-image: ${{ needs.linux-jammy-rocm-py3_10-periodic-dynamo-benchmarks-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-jammy-rocm-py3_10-periodic-dynamo-benchmarks-build.outputs.test-matrix }} + docker-image: ${{ needs.rocm-periodic-dynamo-benchmarks-build.outputs.docker-image }} + test-matrix: ${{ needs.rocm-periodic-dynamo-benchmarks-build.outputs.test-matrix }} secrets: inherit - linux-jammy-cuda12_8-py3_10-gcc9-inductor-smoke-build: - name: cuda12.8-py3.10-gcc9-sm80 + inductor-smoke-build: + name: inductor-smoke-build uses: ./.github/workflows/_linux-build.yml needs: - get-default-label-prefix @@ -129,23 +139,23 @@ jobs: build-additional-packages: "vision audio fbgemm torchao" secrets: inherit - linux-jammy-cuda12_8-py3_10-gcc9-inductor-smoke-test: - name: cuda12.8-py3.10-gcc9-sm80 + inductor-smoke-test: + name: inductor-smoke-test uses: ./.github/workflows/_linux-test.yml - needs: linux-jammy-cuda12_8-py3_10-gcc9-inductor-smoke-build + needs: 
inductor-smoke-build with: build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80 - docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-smoke-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-smoke-build.outputs.test-matrix }} + docker-image: ${{ needs.inductor-smoke-build.outputs.docker-image }} + test-matrix: ${{ needs.inductor-smoke-build.outputs.test-matrix }} secrets: inherit - linux-jammy-cpu-py3_9-gcc11-periodic-dynamo-benchmarks-build: - name: linux-jammy-cpu-py3.9-gcc11-periodic-dynamo-benchmarks + periodic-dynamo-benchmarks-cpu-build: + name: periodic-dynamo-benchmarks-cpu-build uses: ./.github/workflows/_linux-build.yml needs: get-default-label-prefix with: - build-environment: linux-jammy-py3.9-gcc11-build - docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks + build-environment: linux-jammy-py3.10-gcc11-build + docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}" test-matrix: | { include: [ @@ -160,68 +170,6 @@ jobs: { config: "cpu_inductor_freezing_avx2_torchbench", shard: 2, num_shards: 2, runner: "linux.10xlarge.avx2" }, { config: "cpu_inductor_freezing_avx2_timm", shard: 1, num_shards: 2, runner: "linux.10xlarge.avx2" }, { config: "cpu_inductor_freezing_avx2_timm", shard: 2, num_shards: 2, runner: "linux.10xlarge.avx2" }, - ]} - build-additional-packages: "vision audio torchao" - secrets: inherit - - linux-jammy-cpu-py3_9-gcc11-periodic-dynamo-benchmarks-test: - name: linux-jammy-cpu-py3.9-gcc11-periodic-dynamo-benchmarks - uses: ./.github/workflows/_linux-test.yml - needs: linux-jammy-cpu-py3_9-gcc11-periodic-dynamo-benchmarks-build - with: - build-environment: linux-jammy-py3.9-gcc11-build - docker-image: ${{ needs.linux-jammy-cpu-py3_9-gcc11-periodic-dynamo-benchmarks-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-jammy-cpu-py3_9-gcc11-periodic-dynamo-benchmarks-build.outputs.test-matrix }} - secrets: inherit - - - linux-jammy-cuda12_8-py3_10-gcc9-inductor-build: - name: cuda12.8-py3.10-gcc9-sm86 - uses: ./.github/workflows/_linux-build.yml - needs: get-default-label-prefix - with: - build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm86 - docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks - cuda-arch-list: '8.6' - runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}" - sync-tag: linux-jammy-cuda12_8-py3_10-gcc9-inductor-build - test-matrix: | - { include: [ - { config: "dynamic_inductor_huggingface", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" }, - { config: "dynamic_inductor_timm", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, - { config: "dynamic_inductor_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, - { config: "dynamic_inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, - { config: "dynamic_inductor_torchbench", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, - { config: "aot_inductor_huggingface", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" }, - { config: "aot_inductor_timm", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, - { config: "aot_inductor_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, - { config: "aot_inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, - { config: 
"aot_inductor_torchbench", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, - ]} - build-additional-packages: "vision audio fbgemm torchao" - secrets: inherit - - linux-jammy-cuda12_8-py3_10-gcc9-inductor-test: - name: cuda12.8-py3.10-gcc9-sm86 - uses: ./.github/workflows/_linux-test.yml - needs: linux-jammy-cuda12_8-py3_10-gcc9-inductor-build - with: - build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm86 - docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.test-matrix }} - secrets: inherit - - linux-jammy-cpu-py3_9-gcc11-inductor-build: - name: linux-jammy-cpu-py3.9-gcc11-inductor - uses: ./.github/workflows/_linux-build.yml - needs: get-default-label-prefix - with: - build-environment: linux-jammy-py3.9-gcc11-build - docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks - runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}" - sync-tag: linux-jammy-cpu-py3_9-gcc11-inductor-build - test-matrix: | - { include: [ { config: "cpu_inductor_freezing_huggingface", shard: 1, num_shards: 1, runner: "linux.8xlarge.amx" }, { config: "cpu_inductor_freezing_timm", shard: 1, num_shards: 2, runner: "linux.8xlarge.amx" }, { config: "cpu_inductor_freezing_timm", shard: 2, num_shards: 2, runner: "linux.8xlarge.amx" }, @@ -247,12 +195,12 @@ jobs: build-additional-packages: "vision audio torchao" secrets: inherit - linux-jammy-cpu-py3_9-gcc11-inductor-test: - name: linux-jammy-cpu-py3.9-gcc11-inductor + periodic-dynamo-benchmarks-cpu-test: + name: periodic-dynamo-benchmarks-cpu-test uses: ./.github/workflows/_linux-test.yml - needs: linux-jammy-cpu-py3_9-gcc11-inductor-build + needs: periodic-dynamo-benchmarks-cpu-build with: - build-environment: linux-jammy-py3.9-gcc11-build - docker-image: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.test-matrix }} + build-environment: linux-jammy-py3.10-gcc11-build + docker-image: ${{ needs.periodic-dynamo-benchmarks-cpu-build.outputs.docker-image }} + test-matrix: ${{ needs.periodic-dynamo-benchmarks-cpu-build.outputs.test-matrix }} secrets: inherit diff --git a/.github/workflows/inductor-rocm-mi300.yml b/.github/workflows/inductor-rocm-mi300.yml index 732ec7eb85f3..369eee791dd6 100644 --- a/.github/workflows/inductor-rocm-mi300.yml +++ b/.github/workflows/inductor-rocm-mi300.yml @@ -28,7 +28,7 @@ jobs: get-label-type: name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.9 if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} diff --git a/.github/workflows/inductor-rocm.yml b/.github/workflows/inductor-rocm.yml index b1bb7972d67d..87d78b600f44 100644 --- a/.github/workflows/inductor-rocm.yml +++ b/.github/workflows/inductor-rocm.yml @@ -20,7 +20,7 @@ permissions: jobs: get-label-type: name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.9 if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} 
diff --git a/.github/workflows/inductor-unittest.yml b/.github/workflows/inductor-unittest.yml index df918c329dd7..31ca8e6faa3b 100644 --- a/.github/workflows/inductor-unittest.yml +++ b/.github/workflows/inductor-unittest.yml @@ -19,7 +19,7 @@ permissions: jobs: get-label-type: name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.9 if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} @@ -28,8 +28,8 @@ jobs: curr_ref_type: ${{ github.ref_type }} opt_out_experiments: lf - linux-jammy-cuda12_8-py3_10-gcc9-inductor-build: - name: cuda12.8-py3.10-gcc9-sm86 + inductor-build: + name: inductor-build uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: @@ -47,44 +47,18 @@ jobs: ]} secrets: inherit - linux-jammy-cuda12_8-py3_10-gcc9-inductor-test: - name: cuda12.8-py3.10-gcc9-sm86 + inductor-test: + name: inductor-test uses: ./.github/workflows/_linux-test.yml - needs: linux-jammy-cuda12_8-py3_10-gcc9-inductor-build + needs: inductor-build with: build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm86 - docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.test-matrix }} + docker-image: ${{ needs.inductor-build.outputs.docker-image }} + test-matrix: ${{ needs.inductor-build.outputs.test-matrix }} secrets: inherit - linux-jammy-cuda12_8-py3_12-gcc9-inductor-build: - name: cuda12.8-py3.12-gcc9-sm86 - uses: ./.github/workflows/_linux-build.yml - needs: get-label-type - with: - build-environment: linux-jammy-cuda12.8-py3.12-gcc9-sm86 - docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc9-inductor-benchmarks - cuda-arch-list: '8.6' - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - test-matrix: | - { include: [ - { config: "inductor", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" }, - { config: "inductor", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" }, - ]} - secrets: inherit - - linux-jammy-cuda12_8-py3_12-gcc9-inductor-test: - name: cuda12.8-py3.12-gcc9-sm86 - uses: ./.github/workflows/_linux-test.yml - needs: linux-jammy-cuda12_8-py3_12-gcc9-inductor-build - with: - build-environment: linux-jammy-cuda12.8-py3.12-gcc9-sm86 - docker-image: ${{ needs.linux-jammy-cuda12_8-py3_12-gcc9-inductor-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_12-gcc9-inductor-build.outputs.test-matrix }} - secrets: inherit - - linux-jammy-cpu-py3_12-inductor-halide-build: - name: linux-jammy-cpu-py3.12-gcc11-inductor-halide + inductor-halide-build: + name: inductor-halide-build uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: @@ -97,18 +71,18 @@ jobs: ]} secrets: inherit - linux-jammy-cpu-py3_12-inductor-halide-test: - name: linux-jammy-cpu-py3.12-gcc11-inductor-halide + inductor-halide-test: + name: inductor-halide-test uses: ./.github/workflows/_linux-test.yml - needs: linux-jammy-cpu-py3_12-inductor-halide-build + needs: inductor-halide-build with: build-environment: linux-jammy-py3.12-gcc11 - docker-image: ${{ needs.linux-jammy-cpu-py3_12-inductor-halide-build.outputs.docker-image }} - test-matrix: ${{ 
needs.linux-jammy-cpu-py3_12-inductor-halide-build.outputs.test-matrix }} + docker-image: ${{ needs.inductor-halide-build.outputs.docker-image }} + test-matrix: ${{ needs.inductor-halide-build.outputs.test-matrix }} secrets: inherit - linux-jammy-cpu-py3_12-inductor-triton-cpu-build: - name: linux-jammy-cpu-py3.12-gcc11-inductor-triton-cpu + inductor-triton-cpu-build: + name: inductor-triton-cpu-build uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: @@ -121,23 +95,23 @@ jobs: ]} secrets: inherit - linux-jammy-cpu-py3_12-inductor-triton-cpu-test: + inductor-triton-cpu-test: name: linux-jammy-cpu-py3.12-gcc11-inductor-triton-cpu uses: ./.github/workflows/_linux-test.yml - needs: linux-jammy-cpu-py3_12-inductor-triton-cpu-build + needs: inductor-triton-cpu-build with: build-environment: linux-jammy-py3.12-gcc11 - docker-image: ${{ needs.linux-jammy-cpu-py3_12-inductor-triton-cpu-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-jammy-cpu-py3_12-inductor-triton-cpu-build.outputs.test-matrix }} + docker-image: ${{ needs.inductor-triton-cpu-build.outputs.docker-image }} + test-matrix: ${{ needs.inductor-triton-cpu-build.outputs.test-matrix }} secrets: inherit - linux-jammy-cpu-py3_9-gcc11-inductor-build: - name: linux-jammy-cpu-py3.9-gcc11-inductor + inductor-cpu-build: + name: inductor-cpu-build uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: - build-environment: linux-jammy-py3.9-gcc11-build - docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks + build-environment: linux-jammy-py3.10-gcc11-build + docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" test-matrix: | { include: [ @@ -148,37 +122,12 @@ jobs: ]} secrets: inherit - linux-jammy-cpu-py3_9-gcc11-inductor-test: - name: linux-jammy-cpu-py3.9-gcc11-inductor - uses: ./.github/workflows/_linux-test.yml - needs: linux-jammy-cpu-py3_9-gcc11-inductor-build - with: - build-environment: linux-jammy-py3.9-gcc11-build - docker-image: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.test-matrix }} - secrets: inherit - - linux-jammy-cuda12_8-py3_13-gcc9-inductor-build: - name: cuda12.8-py3.13-gcc9-sm86 - uses: ./.github/workflows/_linux-build.yml - needs: get-label-type - with: - build-environment: linux-jammy-cuda12.8-py3.13-gcc9-sm86 - docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3.13-gcc9-inductor-benchmarks - cuda-arch-list: '8.6' - test-matrix: | - { include: [ - { config: "inductor", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" }, - { config: "inductor", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" }, - ]} - secrets: inherit - - linux-jammy-cuda12_8-py3_13-gcc9-inductor-test: - name: cuda12.8-py3.13-gcc9-sm86 + inductor-cpu-test: + name: inductor-cpu-test uses: ./.github/workflows/_linux-test.yml - needs: linux-jammy-cuda12_8-py3_13-gcc9-inductor-build + needs: inductor-cpu-build with: - build-environment: linux-jammy-cuda12.8-py3.13-gcc9-sm86 - docker-image: ${{ needs.linux-jammy-cuda12_8-py3_13-gcc9-inductor-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_13-gcc9-inductor-build.outputs.test-matrix }} + build-environment: linux-jammy-py3.10-gcc11-build + docker-image: ${{ 
needs.inductor-cpu-build.outputs.docker-image }} + test-matrix: ${{ needs.inductor-cpu-build.outputs.test-matrix }} secrets: inherit diff --git a/.github/workflows/inductor.yml b/.github/workflows/inductor.yml index 721572f1807b..a70929dd868d 100644 --- a/.github/workflows/inductor.yml +++ b/.github/workflows/inductor.yml @@ -35,7 +35,7 @@ jobs: get-label-type: name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.9 if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} @@ -44,8 +44,8 @@ jobs: curr_ref_type: ${{ github.ref_type }} opt_out_experiments: lf - linux-jammy-cuda12_8-py3_10-gcc9-inductor-build: - name: cuda12.8-py3.10-gcc9-sm86 + inductor-build: + name: inductor-build uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: @@ -53,7 +53,6 @@ jobs: docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks cuda-arch-list: '8.6' runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - sync-tag: linux-jammy-cuda12_8-py3_10-gcc9-inductor-build test-matrix: | { include: [ { config: "inductor_huggingface", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" }, @@ -65,25 +64,24 @@ jobs: build-additional-packages: "vision audio fbgemm torchao" secrets: inherit - linux-jammy-cuda12_8-py3_10-gcc9-inductor-test: - name: cuda12.8-py3.10-gcc9-sm86 + inductor-test: + name: inductor-test uses: ./.github/workflows/_linux-test.yml - needs: linux-jammy-cuda12_8-py3_10-gcc9-inductor-build + needs: inductor-build with: build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm86 - docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.test-matrix }} + docker-image: ${{ needs.inductor-build.outputs.docker-image }} + test-matrix: ${{ needs.inductor-build.outputs.test-matrix }} secrets: inherit - linux-jammy-cpu-py3_9-gcc11-inductor-build: - name: linux-jammy-cpu-py3.9-gcc11-inductor + inductor-cpu-build: + name: inductor-cpu-build uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: - build-environment: linux-jammy-py3.9-gcc11-build - docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks + build-environment: linux-jammy-py3.10-gcc11-build + docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - sync-tag: linux-jammy-cpu-py3_9-gcc11-inductor-build test-matrix: | { include: [ { config: "cpu_inductor_torchbench", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.8xlarge.amx" }, @@ -98,12 +96,12 @@ jobs: build-additional-packages: "vision audio torchao" secrets: inherit - linux-jammy-cpu-py3_9-gcc11-inductor-test: - name: linux-jammy-cpu-py3.9-gcc11-inductor + inductor-cpu-test: + name: inductor-cpu-test uses: ./.github/workflows/_linux-test.yml - needs: linux-jammy-cpu-py3_9-gcc11-inductor-build + needs: inductor-cpu-build with: - build-environment: linux-jammy-py3.9-gcc11-build - docker-image: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.test-matrix }} + 
build-environment: linux-jammy-py3.10-gcc11-build + docker-image: ${{ needs.inductor-cpu-build.outputs.docker-image }} + test-matrix: ${{ needs.inductor-cpu-build.outputs.test-matrix }} secrets: inherit diff --git a/.github/workflows/lint-autoformat.yml b/.github/workflows/lint-autoformat.yml index b962970dc5b7..f64c9973d698 100644 --- a/.github/workflows/lint-autoformat.yml +++ b/.github/workflows/lint-autoformat.yml @@ -13,7 +13,7 @@ jobs: if: ${{ github.repository_owner == 'pytorch' && contains(github.event.pull_request.labels.*.name, 'autoformat') }} steps: - name: Checkout pytorch - uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.9 with: submodules: true fetch-depth: 0 diff --git a/.github/workflows/lint-bc.yml b/.github/workflows/lint-bc.yml index e0de9ede3508..98adf44aefd8 100644 --- a/.github/workflows/lint-bc.yml +++ b/.github/workflows/lint-bc.yml @@ -20,7 +20,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Run BC Lint Action - uses: pytorch/test-infra/.github/actions/bc-lint@main + uses: pytorch/test-infra/.github/actions/bc-lint@release/2.9 with: repo: ${{ github.event.pull_request.head.repo.full_name }} base_sha: ${{ github.event.pull_request.base.sha }} diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index b1a6dfb39071..534c15824715 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -21,7 +21,7 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.9 with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -33,7 +33,7 @@ jobs: uses: ./.github/workflows/_get-changed-files.yml lintrunner-clang: - uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@release/2.9 needs: [get-label-type, get-changed-files] # Only run if there are changed files relevant to clangtidy / clangformat if: | @@ -53,7 +53,7 @@ jobs: with: timeout: 120 runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" - docker-image: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-linter + docker-image: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3.10-linter # NB: A shallow checkout won't work here because calculate-docker-image requires a full checkout # to run git rev-parse HEAD~:.ci/docker when a new image is needed fetch-depth: 0 @@ -72,7 +72,7 @@ jobs: # NOTE: mypy needs its own job because it depends on --all-files, without assessing all files it sometimes # fails to find types when it should lintrunner-mypy: - uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@release/2.9 needs: [get-label-type, get-changed-files] # Only run if there are changed files relevant to mypy if: | @@ -96,7 +96,7 @@ jobs: ADDITIONAL_LINTRUNNER_ARGS="--take MYPY,MYPYSTRICT --all-files" .github/scripts/lintrunner.sh lintrunner-noclang: - uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@release/2.9 needs: [get-label-type, get-changed-files] with: timeout: 120 @@ -117,7 +117,7 @@ jobs: fi quick-checks: - uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + uses: 
pytorch/test-infra/.github/workflows/linux_job_v2.yml@release/2.9 needs: get-label-type with: timeout: 120 @@ -157,7 +157,7 @@ jobs: if: github.event_name == 'pull_request' && !contains(github.event.pull_request.labels.*.name, 'skip-pr-sanity-checks') steps: - name: Checkout PyTorch - uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.9 with: submodules: false fetch-depth: -1 @@ -170,7 +170,7 @@ jobs: bash .github/scripts/pr-sanity-check.sh workflow-checks: - uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@release/2.9 needs: get-label-type with: timeout: 120 @@ -181,6 +181,7 @@ jobs: ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} script: | # Regenerate workflows + export RELEASE_VERSION_TAG=2.9 .github/scripts/generate_ci_workflows.py RC=0 @@ -190,7 +191,7 @@ jobs: echo 'As shown by the above diff, the committed .github/workflows' echo 'are not up to date according to .github/templates.' echo 'Please run this command, commit, and push again to your PR:' - echo + echo export RELEASE_VERSION_TAG=2.9 echo ' .github/scripts/generate_ci_workflows.py' echo echo 'If running that command does nothing, you may need to rebase' @@ -204,7 +205,7 @@ jobs: exit $RC toc: - uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@release/2.9 needs: get-label-type with: timeout: 120 @@ -240,7 +241,7 @@ jobs: test-tools: name: Test tools if: ${{ github.repository == 'pytorch/pytorch' }} - uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@release/2.9 needs: get-label-type with: timeout: 120 @@ -260,14 +261,14 @@ jobs: runs-on: linux.24_04.4x steps: - name: Checkout PyTorch - uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.9 with: submodules: false fetch-depth: 1 - - name: Setup Python 3.9 + - name: Setup Python 3.10 uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 with: - python-version: '3.9' + python-version: '3.10' architecture: x64 cache: pip - name: Install dependencies @@ -297,7 +298,7 @@ jobs: # [see note: pytorch repo ref] # deep clone (fetch-depth 0) required, to allow us to use git log - name: Checkout PyTorch - uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.9 with: submodules: false fetch-depth: 1 diff --git a/.github/workflows/linux-aarch64.yml b/.github/workflows/linux-aarch64.yml index 2b840a39a5c2..357347f78138 100644 --- a/.github/workflows/linux-aarch64.yml +++ b/.github/workflows/linux-aarch64.yml @@ -19,7 +19,7 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.9 with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} diff --git a/.github/workflows/llm_td_retrieval.yml b/.github/workflows/llm_td_retrieval.yml index 565a9b25df50..292f0a956c35 100644 --- a/.github/workflows/llm_td_retrieval.yml +++ b/.github/workflows/llm_td_retrieval.yml @@ -12,7 +12,7 @@ jobs: name: get-label-type # Don't 
run on forked repos if: github.repository_owner == 'pytorch' - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.9 with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -116,5 +116,5 @@ jobs: AWS_REGION: "" - name: Teardown Linux - uses: pytorch/test-infra/.github/actions/teardown-linux@main + uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.9 if: always() diff --git a/.github/workflows/nightly-s3-uploads.yml b/.github/workflows/nightly-s3-uploads.yml index acf3504dec9c..1cafca0e0c85 100644 --- a/.github/workflows/nightly-s3-uploads.yml +++ b/.github/workflows/nightly-s3-uploads.yml @@ -23,7 +23,7 @@ jobs: environment: upload-stats steps: - name: Checkout PyTorch - uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.9 with: fetch-depth: 1 submodules: false diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml index 2acc987e523c..eddb21ea2ca5 100644 --- a/.github/workflows/nightly.yml +++ b/.github/workflows/nightly.yml @@ -19,7 +19,7 @@ concurrency: jobs: get-label-type: name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.9 if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} @@ -42,8 +42,8 @@ jobs: needs: get-label-type with: runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" - build-environment: linux-jammy-py3.9-gcc11 - docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11 + build-environment: linux-jammy-py3.10-gcc11 + docker-image-name: ci-image:pytorch-linux-jammy-py3.10-gcc11 secrets: inherit docs-push: @@ -54,7 +54,7 @@ jobs: - get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build-environment: linux-jammy-py3.9-gcc11 + build-environment: linux-jammy-py3.10-gcc11 docker-image: ${{ needs.docs-build.outputs.docker-image }} push: ${{ github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' || startsWith(github.event.ref, 'refs/tags/v') }} run-doxygen: true @@ -92,7 +92,7 @@ jobs: if: github.repository_owner == 'pytorch' && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') steps: - name: "${{ matrix.repo-owner }}/${{ matrix.repo-name }} update-commit-hash" - uses: pytorch/test-infra/.github/actions/update-commit-hash@main + uses: pytorch/test-infra/.github/actions/update-commit-hash@release/2.9 with: repo-owner: ${{ matrix.repo-owner }} repo-name: ${{ matrix.repo-name }} diff --git a/.github/workflows/nitpicker.yml b/.github/workflows/nitpicker.yml index 40bd245ce913..242f021e46fa 100644 --- a/.github/workflows/nitpicker.yml +++ b/.github/workflows/nitpicker.yml @@ -19,7 +19,7 @@ jobs: if: ${{ github.event.pull_request.number != 26921 && github.repository_owner == 'pytorch' }} steps: - name: Checkout PyTorch - uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.9 - uses: ethanis/nitpicker@v1 with: nitpicks: '.github/nitpicks.yml' diff --git a/.github/workflows/operator_benchmark.yml b/.github/workflows/operator_benchmark.yml index 16cb1600b8d6..dcdc2cd0ba24 100644 
--- a/.github/workflows/operator_benchmark.yml +++ b/.github/workflows/operator_benchmark.yml @@ -14,6 +14,10 @@ on: schedule: # Run at 07:00 UTC every Sunday - cron: 0 7 * * 0 + pull_request: + paths: + - benchmarks/operator_benchmark/** + - .github/workflows/operator_benchmark.yml concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} @@ -24,38 +28,38 @@ permissions: contents: read jobs: - linux-jammy-cpu-py3_9-gcc11-opbenchmark-build: + opbenchmark-build: if: github.repository_owner == 'pytorch' - name: linux-jammy-cpu-py3.9-gcc11-opbenchmark + name: opbenchmark-build uses: ./.github/workflows/_linux-build.yml with: - build-environment: linux-jammy-py3.9-gcc11-build - docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks + build-environment: linux-jammy-py3.10-gcc11-build + docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks test-matrix: | { include: [ { config: "cpu_operator_benchmark_short", shard: 1, num_shards: 1, runner: "linux.12xlarge" }, ]} secrets: inherit - linux-jammy-cpu-py3_9-gcc11-opbenchmark-on-demand-build: + opbenchmark-on-demand-build: if: ${{ github.event_name == 'workflow_dispatch' && github.repository_owner == 'pytorch' }} - name: linux-jammy-cpu-py3.9-gcc11-opbenchmark + name: opbenchmark-on-demand-build uses: ./.github/workflows/_linux-build.yml with: - build-environment: linux-jammy-py3.9-gcc11-build - docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks + build-environment: linux-jammy-py3.10-gcc11-build + docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks test-matrix: | { include: [ { config: "cpu_operator_benchmark_${{ inputs.test_mode }}", shard: 1, num_shards: 1, runner: "linux.12xlarge" }, ]} secrets: inherit - linux-jammy-cpu-py3_9-gcc11-opbenchmark-test: - name: linux-jammy-cpu-py3.9-gcc11-opbenchmark + opbenchmark-test: + name: opbenchmark-test uses: ./.github/workflows/_linux-test.yml - needs: linux-jammy-cpu-py3_9-gcc11-opbenchmark-build + needs: opbenchmark-build with: - build-environment: linux-jammy-py3.9-gcc11-build - docker-image: ${{ needs.linux-jammy-cpu-py3_9-gcc11-opbenchmark-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-jammy-cpu-py3_9-gcc11-opbenchmark-build.outputs.test-matrix }} + build-environment: linux-jammy-py3.10-gcc11-build + docker-image: ${{ needs.opbenchmark-build.outputs.docker-image }} + test-matrix: ${{ needs.opbenchmark-build.outputs.test-matrix }} secrets: inherit diff --git a/.github/workflows/operator_microbenchmark.yml b/.github/workflows/operator_microbenchmark.yml new file mode 100644 index 000000000000..9205b927c5d7 --- /dev/null +++ b/.github/workflows/operator_microbenchmark.yml @@ -0,0 +1,46 @@ +name: operator_microbenchmark + +on: + push: + tags: + - ciflow/op-benchmark/* + workflow_dispatch: + schedule: + # Run at 06:00 UTC everyday + - cron: 0 6 * * * + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} + cancel-in-progress: true + +permissions: + id-token: write + contents: read + +jobs: + opmicrobenchmark-build: + if: github.repository_owner == 'pytorch' + name: opmicrobenchmark-build + uses: ./.github/workflows/_linux-build.yml + with: + runner: linux.12xlarge.memory + build-environment: 
linux-jammy-cuda12.8-py3.10-gcc9-sm80 + docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11 + cuda-arch-list: '8.0 9.0' + test-matrix: | + { include: [ + { config: "operator_microbenchmark_test", shard: 1, num_shards: 1, runner: "linux.aws.h100" }, + { config: "operator_microbenchmark_test", shard: 1, num_shards: 1, runner: "linux.aws.a100" }, + ]} + secrets: inherit + + opmicrobenchmark-test: + name: opmicrobenchmark-test + uses: ./.github/workflows/_linux-test.yml + needs: opmicrobenchmark-build + with: + timeout-minutes: 500 + build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80 + docker-image: ${{ needs.opmicrobenchmark-build.outputs.docker-image }} + test-matrix: ${{ needs.opmicrobenchmark-build.outputs.test-matrix }} + secrets: inherit diff --git a/.github/workflows/periodic-rocm-mi300.yml b/.github/workflows/periodic-rocm-mi300.yml index 4d8890e69fc7..850c98b3fa81 100644 --- a/.github/workflows/periodic-rocm-mi300.yml +++ b/.github/workflows/periodic-rocm-mi300.yml @@ -41,7 +41,7 @@ jobs: get-label-type: name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.9 if: (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' with: triggering_actor: ${{ github.triggering_actor }} diff --git a/.github/workflows/periodic.yml b/.github/workflows/periodic.yml index 7d43c68c61b0..418699cb5f5a 100644 --- a/.github/workflows/periodic.yml +++ b/.github/workflows/periodic.yml @@ -43,7 +43,7 @@ jobs: get-label-type: name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.9 if: (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' with: triggering_actor: ${{ github.triggering_actor }} @@ -59,13 +59,14 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build-environment: linux-jammy-cuda12.4-py3.10-gcc11 docker-image-name: ci-image:pytorch-linux-jammy-cuda12.4-cudnn9-py3-gcc11 + cuda-arch-list: 7.5 test-matrix: | { include: [ - { config: "legacy_nvidia_driver", shard: 1, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" }, - { config: "legacy_nvidia_driver", shard: 2, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" }, - { config: "legacy_nvidia_driver", shard: 3, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" }, - { config: "legacy_nvidia_driver", shard: 4, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" }, - { config: "legacy_nvidia_driver", shard: 5, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" }, + { config: "legacy_nvidia_driver", shard: 1, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.4xlarge.nvidia.gpu" }, + { config: "legacy_nvidia_driver", shard: 2, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.4xlarge.nvidia.gpu" }, + { config: "legacy_nvidia_driver", shard: 3, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.4xlarge.nvidia.gpu" }, + { config: "legacy_nvidia_driver", shard: 4, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type 
}}linux.g4dn.4xlarge.nvidia.gpu" }, + { config: "legacy_nvidia_driver", shard: 5, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.4xlarge.nvidia.gpu" }, ]} secrets: inherit @@ -170,6 +171,38 @@ jobs: test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-debug-build.outputs.test-matrix }} secrets: inherit + linux-jammy-cuda13_0-py3_10-gcc11-build: + name: linux-jammy-cuda13.0-py3.10-gcc11 + uses: ./.github/workflows/_linux-build.yml + needs: get-label-type + with: + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + cuda-arch-list: 7.5 + build-environment: linux-jammy-cuda13.0-py3.10-gcc11 + docker-image-name: ci-image:pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc11 + test-matrix: | + { include: [ + { config: "nogpu_AVX512", shard: 1, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.4xlarge.nvidia.gpu" }, + { config: "nogpu_AVX512", shard: 2, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.4xlarge.nvidia.gpu" }, + { config: "nogpu_AVX512", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.4xlarge.nvidia.gpu" }, + { config: "nogpu_NO_AVX2", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.4xlarge.nvidia.gpu" }, + { config: "nogpu_NO_AVX2", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.4xlarge.nvidia.gpu" }, + { config: "jit_legacy", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.4xlarge.nvidia.gpu" }, + ]} + secrets: inherit + + linux-jammy-cuda13_0-py3_10-gcc11-test: + name: linux-jammy-cuda13.0-py3.10-gcc11 + uses: ./.github/workflows/_linux-test.yml + needs: + - linux-jammy-cuda13_0-py3_10-gcc11-build + - target-determination + with: + build-environment: linux-jammy-cuda13.0-py3.10-gcc11 + docker-image: ${{ needs.linux-jammy-cuda13_0-py3_10-gcc11-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-cuda13_0-py3_10-gcc11-build.outputs.test-matrix }} + secrets: inherit + linux-jammy-rocm-py3_10-build: name: linux-jammy-rocm-py3.10 uses: ./.github/workflows/_linux-build.yml diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index e2cac7bb7315..f884fee53fc7 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -42,21 +42,21 @@ jobs: get-label-type: name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.9 if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} curr_branch: ${{ github.head_ref || github.ref_name }} - linux-jammy-py3_9-gcc11-build: - name: linux-jammy-py3.9-gcc11 + linux-jammy-py3_10-gcc11-build: + name: linux-jammy-py3.10-gcc11 uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build-environment: linux-jammy-py3.9-gcc11 - docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11 + build-environment: linux-jammy-py3.10-gcc11 + docker-image-name: ci-image:pytorch-linux-jammy-py3.10-gcc11 test-matrix: | { include: [ { config: "default", shard: 1, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" }, @@ 
-73,49 +73,49 @@ jobs: ]} secrets: inherit - linux-jammy-py3_9-gcc11-test: - name: linux-jammy-py3.9-gcc11 + linux-jammy-py3_10-gcc11-test: + name: linux-jammy-py3.10-gcc11 uses: ./.github/workflows/_linux-test.yml needs: - - linux-jammy-py3_9-gcc11-build + - linux-jammy-py3_10-gcc11-build - target-determination with: - build-environment: linux-jammy-py3.9-gcc11 - docker-image: ${{ needs.linux-jammy-py3_9-gcc11-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-jammy-py3_9-gcc11-build.outputs.test-matrix }} + build-environment: linux-jammy-py3.10-gcc11 + docker-image: ${{ needs.linux-jammy-py3_10-gcc11-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-py3_10-gcc11-build.outputs.test-matrix }} secrets: inherit linux-docs: name: linux-docs uses: ./.github/workflows/_docs.yml - needs: linux-jammy-py3_9-gcc11-build + needs: linux-jammy-py3_10-gcc11-build with: - build-environment: linux-jammy-py3.9-gcc11 - docker-image: ${{ needs.linux-jammy-py3_9-gcc11-build.outputs.docker-image }} + build-environment: linux-jammy-py3.10-gcc11 + docker-image: ${{ needs.linux-jammy-py3_10-gcc11-build.outputs.docker-image }} secrets: inherit - linux-jammy-py3_9-gcc11-no-ops: - name: linux-jammy-py3.9-gcc11-no-ops + linux-jammy-py3_10-gcc11-no-ops: + name: linux-jammy-py3.10-gcc11-no-ops uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build-environment: linux-jammy-py3.9-gcc11-no-ops - docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11 + build-environment: linux-jammy-py3.10-gcc11-no-ops + docker-image-name: ci-image:pytorch-linux-jammy-py3.10-gcc11 test-matrix: | { include: [ { config: "default", shard: 1, num_shards: 1 }, ]} secrets: inherit - linux-jammy-py3_9-gcc11-pch: - name: linux-jammy-py3.9-gcc11-pch + linux-jammy-py3_10-gcc11-pch: + name: linux-jammy-py3.10-gcc11-pch uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build-environment: linux-jammy-py3.9-gcc11-pch - docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11 + build-environment: linux-jammy-py3.10-gcc11-pch + docker-image-name: ci-image:pytorch-linux-jammy-py3.10-gcc11 test-matrix: | { include: [ { config: "default", shard: 1, num_shards: 1 }, @@ -132,17 +132,17 @@ jobs: docker-image-name: ci-image:pytorch-linux-jammy-py3-clang18-asan test-matrix: | { include: [ - { config: "default", shard: 1, num_shards: 6, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, - { config: "default", shard: 2, num_shards: 6, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, - { config: "default", shard: 3, num_shards: 6, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, - { config: "default", shard: 4, num_shards: 6, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, - { config: "default", shard: 5, num_shards: 6, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, - { config: "default", shard: 6, num_shards: 6, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, + { config: "default", shard: 1, num_shards: 7, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, + { config: "default", shard: 2, num_shards: 7, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, + { config: "default", shard: 3, num_shards: 7, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, 
+ { config: "default", shard: 4, num_shards: 7, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, + { config: "default", shard: 5, num_shards: 7, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, + { config: "default", shard: 6, num_shards: 7, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, + { config: "default", shard: 7, num_shards: 7, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, ]} sync-tag: asan-build secrets: inherit - linux-jammy-py3_10-clang18-asan-test: name: linux-jammy-py3.10-clang18-asan uses: ./.github/workflows/_linux-test.yml @@ -183,14 +183,14 @@ jobs: test-matrix: ${{ needs.linux-jammy-py3_10-clang12-onnx-build.outputs.test-matrix }} secrets: inherit - linux-jammy-py3_9-clang12-build: - name: linux-jammy-py3.9-clang12 + linux-jammy-py3_10-clang12-build: + name: linux-jammy-py3.10-clang12 uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build-environment: linux-jammy-py3.9-clang12 - docker-image-name: ci-image:pytorch-linux-jammy-py3.9-clang12 + build-environment: linux-jammy-py3.10-clang12 + docker-image-name: ci-image:pytorch-linux-jammy-py3.10-clang12 test-matrix: | { include: [ { config: "default", shard: 1, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, @@ -207,16 +207,16 @@ jobs: ]} secrets: inherit - linux-jammy-py3_9-clang12-test: - name: linux-jammy-py3.9-clang12 + linux-jammy-py3_10-clang12-test: + name: linux-jammy-py3.10-clang12 uses: ./.github/workflows/_linux-test.yml needs: - - linux-jammy-py3_9-clang12-build + - linux-jammy-py3_10-clang12-build - target-determination with: - build-environment: linux-jammy-py3.9-clang12 - docker-image: ${{ needs.linux-jammy-py3_9-clang12-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-jammy-py3_9-clang12-build.outputs.test-matrix }} + build-environment: linux-jammy-py3.10-clang12 + docker-image: ${{ needs.linux-jammy-py3_10-clang12-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-py3_10-clang12-build.outputs.test-matrix }} secrets: inherit linux-jammy-py3_13-clang12-build: @@ -253,14 +253,14 @@ jobs: test-matrix: ${{ needs.linux-jammy-py3_13-clang12-build.outputs.test-matrix }} secrets: inherit - linux-jammy-cuda12_8-cudnn9-py3_9-clang12-build: - name: linux-jammy-cuda12.8-cudnn9-py3.9-clang12 + linux-jammy-cuda12_8-cudnn9-py3_10-clang12-build: + name: linux-jammy-cuda12.8-cudnn9-py3.10-clang12 uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build-environment: linux-jammy-cuda12.8-cudnn9-py3.9-clang12 - docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-clang12 + build-environment: linux-jammy-cuda12.8-cudnn9-py3.10-clang12 + docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3.10-clang12 test-matrix: | { include: [ { config: "default", shard: 1, num_shards: 1 }, @@ -282,14 +282,14 @@ jobs: ]} secrets: inherit - linux-jammy-py3_9-gcc11-mobile-lightweight-dispatch-build: - name: linux-jammy-py3.9-gcc11-mobile-lightweight-dispatch-build + linux-jammy-py3_10-gcc11-mobile-lightweight-dispatch-build: + name: linux-jammy-py3.10-gcc11-mobile-lightweight-dispatch-build uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build-environment: 
linux-jammy-py3.9-gcc11-mobile-lightweight-dispatch-build - docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11 + build-environment: linux-jammy-py3.10-gcc11-mobile-lightweight-dispatch-build + docker-image-name: ci-image:pytorch-linux-jammy-py3.10-gcc11 build-generates-artifacts: false test-matrix: | { include: [ @@ -342,15 +342,40 @@ jobs: test-matrix: ${{ needs.linux-jammy-py3-clang12-executorch-build.outputs.test-matrix }} secrets: inherit - linux-jammy-xpu-2025_1-py3_9-build: - name: linux-jammy-xpu-2025.1-py3.9 + linux-jammy-cuda12_8-py3_10-gcc9-inductor-build: + name: cuda12.8-py3.10-gcc9-sm75 + uses: ./.github/workflows/_linux-build.yml + needs: get-label-type + with: + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm75 + docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks + cuda-arch-list: '7.5' + test-matrix: | + { include: [ + { config: "pr_time_benchmarks", shard: 1, num_shards: 1, runner: "linux.g4dn.metal.nvidia.gpu" }, + ]} + secrets: inherit + + linux-jammy-cuda12_8-py3_10-gcc9-inductor-test: + name: cuda12.8-py3.10-gcc9-sm75 + uses: ./.github/workflows/_linux-test.yml + needs: linux-jammy-cuda12_8-py3_10-gcc9-inductor-build + with: + build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm75 + docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.test-matrix }} + secrets: inherit + + linux-jammy-xpu-n-py3_9-build: + name: linux-jammy-xpu-n-py3.9 uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: - sync-tag: linux-xpu-2025-1-build + sync-tag: linux-xpu-n-build runner_prefix: ${{ needs.get-label-type.outputs.label-type }} - build-environment: linux-jammy-xpu-2025.1-py3.9 - docker-image-name: ci-image:pytorch-linux-jammy-xpu-2025.1-py3 + build-environment: linux-jammy-xpu-n-py3.9 + docker-image-name: ci-image:pytorch-linux-jammy-xpu-n-py3 test-matrix: | { include: [ { config: "default", shard: 1, num_shards: 4, runner: "linux.idc.xpu" }, diff --git a/.github/workflows/rocm-mi300.yml b/.github/workflows/rocm-mi300.yml index 7e3ba43bf984..51a807250f54 100644 --- a/.github/workflows/rocm-mi300.yml +++ b/.github/workflows/rocm-mi300.yml @@ -28,7 +28,7 @@ jobs: get-label-type: name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.9 if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} diff --git a/.github/workflows/rocm-mi355.yml b/.github/workflows/rocm-mi355.yml index e5dda604a4db..adf5fe919087 100644 --- a/.github/workflows/rocm-mi355.yml +++ b/.github/workflows/rocm-mi355.yml @@ -22,7 +22,7 @@ jobs: get-label-type: name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.9 if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} diff --git a/.github/workflows/slow.yml b/.github/workflows/slow.yml index 2a7b1d184330..197a04054bfe 100644 --- a/.github/workflows/slow.yml +++ b/.github/workflows/slow.yml @@ -41,7 +41,7 @@ jobs: get-label-type: name: 
get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.9 if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} @@ -78,14 +78,14 @@ jobs: test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm86-build.outputs.test-matrix }} secrets: inherit - linux-jammy-py3_9-clang12-build: - name: linux-jammy-py3.9-clang12 + linux-jammy-py3_10-clang12-build: + name: linux-jammy-py3.10-clang12 uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build-environment: linux-jammy-py3.9-clang12 - docker-image-name: ci-image:pytorch-linux-jammy-py3.9-clang12 + build-environment: linux-jammy-py3.10-clang12 + docker-image-name: ci-image:pytorch-linux-jammy-py3.10-clang12 test-matrix: | { include: [ { config: "slow", shard: 1, num_shards: 2, runner: "linux.2xlarge" }, @@ -93,16 +93,16 @@ jobs: ]} secrets: inherit - linux-jammy-py3_9-clang12-test: - name: linux-jammy-py3.9-clang12 + linux-jammy-py3_10-clang12-test: + name: linux-jammy-py3.10-clang12 uses: ./.github/workflows/_linux-test.yml needs: - - linux-jammy-py3_9-clang12-build + - linux-jammy-py3_10-clang12-build - target-determination with: - build-environment: linux-jammy-py3.9-clang12 - docker-image: ${{ needs.linux-jammy-py3_9-clang12-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-jammy-py3_9-clang12-build.outputs.test-matrix }} + build-environment: linux-jammy-py3.10-clang12 + docker-image: ${{ needs.linux-jammy-py3_10-clang12-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-py3_10-clang12-build.outputs.test-matrix }} secrets: inherit linux-jammy-rocm-py3_10-build: diff --git a/.github/workflows/target-determination-indexer.yml b/.github/workflows/target-determination-indexer.yml index ec579fda8da9..f5f29c9646f4 100644 --- a/.github/workflows/target-determination-indexer.yml +++ b/.github/workflows/target-determination-indexer.yml @@ -13,7 +13,7 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.9 with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -35,7 +35,7 @@ jobs: - name: Calculate docker image id: calculate-docker-image - uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.9 with: docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11 working-directory: pytorch @@ -50,13 +50,13 @@ jobs: echo "docker pull ghcr.io/pytorch/ci-image:${tag/:/-}" - name: Pull docker image - uses: pytorch/test-infra/.github/actions/pull-docker-image@main + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.9 with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG id: install-nvidia-driver - uses: pytorch/test-infra/.github/actions/setup-nvidia@main + uses: pytorch/test-infra/.github/actions/setup-nvidia@release/2.9 - name: Clone CodeLlama uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 @@ -149,7 +149,7 @@ 
jobs: "s3://target-determinator-assets/indexes/latest/${ZIP_NAME}" - name: Teardown Linux - uses: pytorch/test-infra/.github/actions/teardown-linux@main + uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.9 if: always() concurrency: diff --git a/.github/workflows/target_determination.yml b/.github/workflows/target_determination.yml index c712b11185a7..3e9f848e9e09 100644 --- a/.github/workflows/target_determination.yml +++ b/.github/workflows/target_determination.yml @@ -9,7 +9,7 @@ jobs: name: get-label-type # Don't run on forked repos if: github.repository_owner == 'pytorch' - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.9 with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -27,7 +27,7 @@ jobs: # checkout because when we run this action we don't *have* a local # checkout. In other cases you should prefer a local checkout. - name: Checkout PyTorch - uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.9 with: submodules: false diff --git a/.github/workflows/test-check-binary.yml b/.github/workflows/test-check-binary.yml index 5f0ad59d3a3b..a13e1d027f13 100644 --- a/.github/workflows/test-check-binary.yml +++ b/.github/workflows/test-check-binary.yml @@ -15,7 +15,7 @@ jobs: check_binary_linux_cpu: if: github.repository_owner == 'pytorch' name: Test check_binary.sh for Linux CPU - uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@release/2.9 with: docker-image: python:3.11 docker-build-dir: "skip-docker-build" @@ -28,7 +28,7 @@ jobs: check_binary_linux_cuda: if: github.repository_owner == 'pytorch' name: Test check_binary.sh for Linux CUDA - uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@release/2.9 with: runner: linux.g4dn.4xlarge.nvidia.gpu docker-image: python:3.11 diff --git a/.github/workflows/test-h100.yml b/.github/workflows/test-h100.yml index 1e83c7b9d98c..d08d6033c47e 100644 --- a/.github/workflows/test-h100.yml +++ b/.github/workflows/test-h100.yml @@ -28,7 +28,7 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.9 with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} diff --git a/.github/workflows/tools-unit-tests.yml b/.github/workflows/tools-unit-tests.yml index c687c07b7ca7..9c104571ef89 100644 --- a/.github/workflows/tools-unit-tests.yml +++ b/.github/workflows/tools-unit-tests.yml @@ -25,7 +25,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout pytorch - uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.9 with: submodules: true fetch-depth: 0 @@ -52,7 +52,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout pytorch - uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.9 with: submodules: true fetch-depth: 0 diff --git a/.github/workflows/torchbench.yml b/.github/workflows/torchbench.yml index 
08fcd3340262..e4f0c692e976 100644 --- a/.github/workflows/torchbench.yml +++ b/.github/workflows/torchbench.yml @@ -18,7 +18,7 @@ jobs: get-default-label-prefix: if: github.repository_owner == 'pytorch' name: get-default-label-prefix - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.9 with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml index 0081e4e1f895..efc027ad2acb 100644 --- a/.github/workflows/trunk.yml +++ b/.github/workflows/trunk.yml @@ -39,7 +39,7 @@ jobs: get-label-type: name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.9 if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} @@ -224,13 +224,12 @@ jobs: tests-to-include: "test_nn test_torch test_cuda test_ops test_unary_ufuncs test_binary_ufuncs test_autograd inductor/test_torchinductor distributed/test_c10d_common distributed/test_c10d_nccl" secrets: inherit - # NB: Keep this in sync with inductor-perf-test-nightly.yml - linux-jammy-cuda12_8-py3_10-gcc9-inductor-build: - name: cuda12.8-py3.10-gcc9-sm80 + inductor-build: + name: inductor-build uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: - build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80 + build-environment: linux-jammy-cuda12.8-py3.12-gcc9-sm80 docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks cuda-arch-list: '8.0' secrets: inherit @@ -241,8 +240,8 @@ jobs: needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build-environment: linux-jammy-py3.9-gcc11 - docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks + build-environment: linux-jammy-py3.10-gcc11 + docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks test-matrix: | { include: [ { config: "verify_cachebench", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" }, @@ -256,7 +255,7 @@ jobs: - verify-cachebench-cpu-build - target-determination with: - build-environment: linux-jammy-py3.9-gcc11 + build-environment: linux-jammy-py3.10-gcc11 docker-image: ${{ needs.verify-cachebench-cpu-build.outputs.docker-image }} test-matrix: ${{ needs.verify-cachebench-cpu-build.outputs.test-matrix }} secrets: inherit diff --git a/.github/workflows/trymerge.yml b/.github/workflows/trymerge.yml index 1fdb1da67a59..5c456c607c88 100644 --- a/.github/workflows/trymerge.yml +++ b/.github/workflows/trymerge.yml @@ -59,22 +59,19 @@ jobs: # on the PR appear in chronological order (timing issues can shuffle them around) sleep 60 fi + + # Require a comment id for merge operations + if [ -z "${COMMENT_ID}" ]; then + echo "Error: merge requires COMMENT_ID to be specified" + exit 1 + fi + if [ -n "${FORCE}" ]; then - if [ -n "${COMMENT_ID}" ]; then - python3 .github/scripts/trymerge.py --force --comment-id "${COMMENT_ID}" "${PR_NUM}" - else - python3 .github/scripts/trymerge.py --force "${PR_NUM}" - fi + python3 .github/scripts/trymerge.py --force --comment-id "${COMMENT_ID}" "${PR_NUM}" elif [ -n "${IGNORE_CURRENT}" ]; then - if [ -n 
"${COMMENT_ID}" ]; then - python3 .github/scripts/trymerge.py --ignore-current --comment-id "${COMMENT_ID}" "${PR_NUM}" - else - python3 .github/scripts/trymerge.py --ignore-current "${PR_NUM}" - fi - elif [ -n "${COMMENT_ID}" ]; then - python3 .github/scripts/trymerge.py --comment-id "${COMMENT_ID}" "${PR_NUM}" + python3 .github/scripts/trymerge.py --ignore-current --comment-id "${COMMENT_ID}" "${PR_NUM}" else - python3 .github/scripts/trymerge.py "${PR_NUM}" + python3 .github/scripts/trymerge.py --comment-id "${COMMENT_ID}" "${PR_NUM}" fi - name: Comment on Canceled if: ${{ cancelled() && steps.checkout.outcome == 'success' }} diff --git a/.github/workflows/unstable.yml b/.github/workflows/unstable.yml index 7f0fe6058bd0..5eeb8b19a325 100644 --- a/.github/workflows/unstable.yml +++ b/.github/workflows/unstable.yml @@ -46,7 +46,7 @@ jobs: get-label-type: name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.9 if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} diff --git a/.github/workflows/update-viablestrict.yml b/.github/workflows/update-viablestrict.yml index 3d445756f7a2..e3ca35d2d01d 100644 --- a/.github/workflows/update-viablestrict.yml +++ b/.github/workflows/update-viablestrict.yml @@ -18,12 +18,12 @@ jobs: environment: ${{ (github.event_name == 'schedule') && 'mergebot' || '' }} steps: - name: Update viable/strict - uses: pytorch/test-infra/.github/actions/update-viablestrict@main + uses: pytorch/test-infra/.github/actions/update-viablestrict@release/2.9 id: update_viablestrict with: repository: pytorch/pytorch stable-branch: viable/strict - requires: '[\"pull\", \"trunk\", \"lint\", \"linux-binary\", \"linux-aarch64\"]' + requires: '[\"pull\", \"trunk\", \"lint\", \"^linux-binary-manywheel$\", \"^linux-binary-libtorch-release$\", \"linux-aarch64\"]' secret-bot-token: ${{ secrets.MERGEBOT_TOKEN }} clickhouse-url: ${{ secrets.CLICKHOUSE_URL }} clickhouse-username: ${{ secrets.CLICKHOUSE_VIABLESTRICT_USERNAME }} diff --git a/.github/workflows/update_pytorch_labels.yml b/.github/workflows/update_pytorch_labels.yml index a1b8c38141ae..535950b3c0b7 100644 --- a/.github/workflows/update_pytorch_labels.yml +++ b/.github/workflows/update_pytorch_labels.yml @@ -17,7 +17,7 @@ jobs: contents: read steps: - name: Checkout PyTorch - uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.9 with: fetch-depth: 1 submodules: false diff --git a/.github/workflows/upload-test-stats-while-running.yml b/.github/workflows/upload-test-stats-while-running.yml index 9aecaad0e068..82c21467dc6a 100644 --- a/.github/workflows/upload-test-stats-while-running.yml +++ b/.github/workflows/upload-test-stats-while-running.yml @@ -16,7 +16,7 @@ jobs: runs-on: linux.2xlarge steps: - name: Checkout PyTorch - uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.9 with: fetch-depth: 1 submodules: false diff --git a/.github/workflows/upload-test-stats.yml b/.github/workflows/upload-test-stats.yml index f77b6081b776..3cfc651b2a62 100644 --- a/.github/workflows/upload-test-stats.yml +++ b/.github/workflows/upload-test-stats.yml @@ -58,7 +58,7 @@ jobs: run: echo "${TRIGGERING_WORKFLOW}" - name: Checkout PyTorch - uses: 
pytorch/pytorch/.github/actions/checkout-pytorch@main + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.9 - name: Configure aws credentials uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0 diff --git a/.github/workflows/upload-torch-dynamo-perf-stats.yml b/.github/workflows/upload-torch-dynamo-perf-stats.yml index 07471619437a..db3fc72e68e9 100644 --- a/.github/workflows/upload-torch-dynamo-perf-stats.yml +++ b/.github/workflows/upload-torch-dynamo-perf-stats.yml @@ -32,7 +32,7 @@ jobs: name: Upload dynamo performance stats for ${{ github.event.workflow_run.id }}, attempt ${{ github.event.workflow_run.run_attempt }} steps: - name: Checkout PyTorch - uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.9 with: submodules: false fetch-depth: 1 diff --git a/.github/workflows/upload_test_stats_intermediate.yml b/.github/workflows/upload_test_stats_intermediate.yml index 570256200605..1764139fed25 100644 --- a/.github/workflows/upload_test_stats_intermediate.yml +++ b/.github/workflows/upload_test_stats_intermediate.yml @@ -17,7 +17,7 @@ jobs: environment: upload-stats steps: - name: Checkout PyTorch - uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.9 with: fetch-depth: 1 submodules: false diff --git a/.github/workflows/vllm.yml b/.github/workflows/vllm.yml index 14524069ab5a..dcfa4027c7ae 100644 --- a/.github/workflows/vllm.yml +++ b/.github/workflows/vllm.yml @@ -2,12 +2,14 @@ name: vllm-test on: push: + branches: + - main + - release/* tags: - ciflow/vllm/* workflow_dispatch: schedule: - # Every 12 hours starting at 00:00 UTC (00:00 and 12:00) - - cron: '0 0,12 * * *' + - cron: '0 */8 * * *' # every 8 hours at minute 0 (UTC) concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }} @@ -20,7 +22,7 @@ permissions: jobs: get-label-type: name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.9 if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} @@ -46,14 +48,18 @@ jobs: { config: "vllm_basic_models_test", shard: 1, num_shards: 1, runner: "linux.g6.4xlarge.experimental.nvidia.gpu" }, { config: "vllm_entrypoints_test", shard: 1, num_shards: 1,runner: "linux.g6.4xlarge.experimental.nvidia.gpu" }, { config: "vllm_regression_test", shard: 1, num_shards: 1, runner: "linux.g6.4xlarge.experimental.nvidia.gpu" }, - { config: "vllm_lora_280_failure_test", shard: 1, num_shards: 1, runner: "linux.g6.4xlarge.experimental.nvidia.gpu" }, { config: "vllm_multi_model_processor_test", shard: 1, num_shards: 1, runner: "linux.g6.4xlarge.experimental.nvidia.gpu" }, { config: "vllm_pytorch_compilation_unit_tests", shard: 1, num_shards: 1, runner: "linux.g6.4xlarge.experimental.nvidia.gpu" }, + { config: "vllm_lora_28_failure_test", shard: 1, num_shards: 1, runner: "linux.g6.4xlarge.experimental.nvidia.gpu" }, + { config: "vllm_multi_model_test_28_failure_test", shard: 1, num_shards: 1, runner: "linux.g6.4xlarge.experimental.nvidia.gpu"}, + { config: "vllm_languagde_model_test_extended_generation_28_failure_test", shard: 1, 
num_shards: 1, runner: "linux.g6.4xlarge.experimental.nvidia.gpu"}, + { config: "vllm_distributed_test_2_gpu_28_failure_test", shard: 1, num_shards: 1, runner: "linux.g6.4xlarge.experimental.nvidia.gpu" }, { config: "vllm_lora_test", shard: 0, num_shards: 4, runner: "linux.g6.4xlarge.experimental.nvidia.gpu" }, { config: "vllm_lora_test", shard: 1, num_shards: 4, runner: "linux.g6.4xlarge.experimental.nvidia.gpu" }, { config: "vllm_lora_test", shard: 2, num_shards: 4, runner: "linux.g6.4xlarge.experimental.nvidia.gpu" }, { config: "vllm_lora_test", shard: 3, num_shards: 4, runner: "linux.g6.4xlarge.experimental.nvidia.gpu" }, - { config: "vllm_lora_tp_test_distributed", shard: 1, num_shards: 1, runner: "linux.aws.h100.4"}, + { config: "vllm_lora_tp_test_distributed", shard: 1, num_shards: 1, runner: "linux.g6.12xlarge.nvidia.gpu"}, + { config: "vllm_distributed_test_28_failure_test", shard: 1, num_shards: 1, runner: "linux.g6.12xlarge.nvidia.gpu"} ]} secrets: inherit diff --git a/.github/workflows/weekly.yml b/.github/workflows/weekly.yml index b95dadd5f2b1..2c534891c6e2 100644 --- a/.github/workflows/weekly.yml +++ b/.github/workflows/weekly.yml @@ -22,7 +22,7 @@ jobs: fetch-depth: 0 - name: update-xla-commit-hash continue-on-error: true - uses: pytorch/test-infra/.github/actions/update-commit-hash@main + uses: pytorch/test-infra/.github/actions/update-commit-hash@release/2.9 with: repo-name: xla branch: master diff --git a/.github/workflows/win-arm64-build-test.yml b/.github/workflows/win-arm64-build-test.yml index 627a43b56bf7..95b4e2f027f6 100644 --- a/.github/workflows/win-arm64-build-test.yml +++ b/.github/workflows/win-arm64-build-test.yml @@ -4,6 +4,9 @@ on: push: tags: - ciflow/win-arm64/* + schedule: + # Every 4 hours starting at 00:00 UTC + - cron: '0 */4 * * *' env: GIT_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }} diff --git a/.github/workflows/xpu.yml b/.github/workflows/xpu.yml index c62918b4af21..3a17bb9d70a1 100644 --- a/.github/workflows/xpu.yml +++ b/.github/workflows/xpu.yml @@ -19,22 +19,22 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.9 with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} curr_branch: ${{ github.head_ref || github.ref_name }} curr_ref_type: ${{ github.ref_type }} - linux-jammy-xpu-2025_0-py3_9-build: - name: linux-jammy-xpu-2025.0-py3.9 + linux-jammy-xpu-n-1-py3_10-build: + name: linux-jammy-xpu-n-1-py3.10 uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: - sync-tag: linux-xpu-2025-0-build + sync-tag: linux-xpu-n-1-build runner_prefix: ${{ needs.get-label-type.outputs.label-type }} - build-environment: linux-jammy-xpu-2025.0-py3.9 - docker-image-name: ci-image:pytorch-linux-jammy-xpu-2025.0-py3 + build-environment: linux-jammy-xpu-n-1-py3.10 + docker-image-name: ci-image:pytorch-linux-jammy-xpu-n-1-py3 runner: linux.12xlarge test-matrix: | { include: [ @@ -47,60 +47,62 @@ jobs: ]} secrets: inherit - linux-jammy-xpu-2025_1-py3_9-build: - name: linux-jammy-xpu-2025.1-py3.9 + linux-jammy-xpu-n-py3_10-build: + name: linux-jammy-xpu-n-py3.10 uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: - sync-tag: linux-xpu-2025-1-build + sync-tag: linux-xpu-n-build runner_prefix: ${{ needs.get-label-type.outputs.label-type }} - 
build-environment: linux-jammy-xpu-2025.1-py3.9 - docker-image-name: ci-image:pytorch-linux-jammy-xpu-2025.1-py3 + build-environment: linux-jammy-xpu-n-py3.10 + docker-image-name: ci-image:pytorch-linux-jammy-xpu-n-py3 runner: linux.12xlarge test-matrix: | { include: [ - { config: "default", shard: 1, num_shards: 6, runner: "linux.idc.xpu" }, - { config: "default", shard: 2, num_shards: 6, runner: "linux.idc.xpu" }, - { config: "default", shard: 3, num_shards: 6, runner: "linux.idc.xpu" }, - { config: "default", shard: 4, num_shards: 6, runner: "linux.idc.xpu" }, - { config: "default", shard: 5, num_shards: 6, runner: "linux.idc.xpu" }, - { config: "default", shard: 6, num_shards: 6, runner: "linux.idc.xpu" }, + { config: "default", shard: 1, num_shards: 8, runner: "linux.idc.xpu" }, + { config: "default", shard: 2, num_shards: 8, runner: "linux.idc.xpu" }, + { config: "default", shard: 3, num_shards: 8, runner: "linux.idc.xpu" }, + { config: "default", shard: 4, num_shards: 8, runner: "linux.idc.xpu" }, + { config: "default", shard: 5, num_shards: 8, runner: "linux.idc.xpu" }, + { config: "default", shard: 6, num_shards: 8, runner: "linux.idc.xpu" }, + { config: "default", shard: 7, num_shards: 8, runner: "linux.idc.xpu" }, + { config: "default", shard: 8, num_shards: 8, runner: "linux.idc.xpu" }, ]} secrets: inherit - linux-jammy-xpu-2025_1-py3_9-test: - name: linux-jammy-xpu-2025.1-py3.9 + linux-jammy-xpu-n-py3_10-test: + name: linux-jammy-xpu-n-py3.10 uses: ./.github/workflows/_xpu-test.yml - needs: linux-jammy-xpu-2025_1-py3_9-build + needs: linux-jammy-xpu-n-py3_10-build permissions: id-token: write contents: read with: - build-environment: linux-jammy-xpu-2025.1-py3.9 - docker-image: ${{ needs.linux-jammy-xpu-2025_1-py3_9-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-jammy-xpu-2025_1-py3_9-build.outputs.test-matrix }} + build-environment: linux-jammy-xpu-n-py3.10 + docker-image: ${{ needs.linux-jammy-xpu-n-py3_10-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-xpu-n-py3_10-build.outputs.test-matrix }} secrets: inherit - windows-xpu-2025_0-build: + windows-xpu-n-1-build: if: github.repository_owner == 'pytorch' - name: win-vs2022-xpu-2025_0-py3 + name: win-vs2022-xpu-n-1-py3 uses: ./.github/workflows/_win-build.yml with: - build-environment: win-vs2022-xpu-py3 + build-environment: win-vs2022-xpu-n-1-py3 cuda-version: cpu use-xpu: true - xpu-version: '2025.0' + xpu-version: '2025.1' vc-year: '2022' secrets: inherit - windows-xpu-2025_1-build: + windows-xpu-n-build: if: github.repository_owner == 'pytorch' - name: win-vs2022-xpu-2025_1-py3 + name: win-vs2022-xpu-n-py3 uses: ./.github/workflows/_win-build.yml with: - build-environment: win-vs2022-xpu-py3 + build-environment: win-vs2022-xpu-n-py3 cuda-version: cpu use-xpu: true - xpu-version: '2025.1' + xpu-version: '2025.2' vc-year: '2022' secrets: inherit diff --git a/.gitignore b/.gitignore index d1fa4cd3caf2..f20486806796 100644 --- a/.gitignore +++ b/.gitignore @@ -82,6 +82,7 @@ torch/return_types.pyi torch/nn/functional.pyi torch/utils/data/datapipes/datapipe.pyi torch/csrc/autograd/generated/* +torch/csrc/functionalization/generated/* torch/csrc/lazy/generated/*.[!m]* torch_compile_debug/ # Listed manually because some files in this directory are not generated diff --git a/BUILD.bazel b/BUILD.bazel index 58ebc31e243c..f13da6bfbe43 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -91,6 +91,8 @@ generated_cpu_cpp = [ "aten/src/ATen/NativeMetaFunctions.h", "aten/src/ATen/RegistrationDeclarations.h", 
"aten/src/ATen/VmapGeneratedPlumbing.h", + "aten/src/ATen/ViewMetaClasses.h", + "aten/src/ATen/ViewMetaClasses.cpp", "aten/src/ATen/core/aten_interned_strings.h", "aten/src/ATen/core/enum_tag.h", "aten/src/ATen/core/TensorBody.h", @@ -747,6 +749,7 @@ cc_library( "torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemory.cu", "torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryOps.cu", "torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryUtils.cpp", + "torch/csrc/distributed/c10d/symm_mem/cuda_mem_pool.cpp", "torch/csrc/distributed/c10d/symm_mem/intra_node_comm.cu", ], )) + torch_sources, @@ -1105,6 +1108,7 @@ test_suite( "aten/src/ATen/templates/LazyNonNativeIr.h", "aten/src/ATen/templates/RegisterDispatchKey.cpp", "aten/src/ATen/templates/RegisterDispatchDefinitions.ini", + "aten/src/ATen/templates/ViewMetaClassesPythonBinding.cpp", "aten/src/ATen/native/native_functions.yaml", "aten/src/ATen/native/tags.yaml", "aten/src/ATen/native/ts_native_functions.yaml", diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 000000000000..dcdf409e7314 --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,15 @@ +# Testing + +Use our test class and test runner: + +``` +from torch.testing._internal.common_utils import run_tests, TestCase + +class TestFeature(TestCase): + ... + +if __name__ == "__main__": + run_tests() +``` + +To test Tensor equality, use assertEqual. diff --git a/CMakeLists.txt b/CMakeLists.txt index ad7368e19298..ce7890f002d3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -272,7 +272,7 @@ cmake_dependent_option(USE_STATIC_NCCL "Use static NCCL" OFF "USE_NCCL" OFF) cmake_dependent_option(USE_SYSTEM_NCCL "Use system-wide NCCL" OFF "USE_NCCL" OFF) cmake_dependent_option(USE_NVSHMEM "Use NVSHMEM" ON - "USE_CUDA OR USE_ROCM;UNIX;NOT APPLE" OFF) + "USE_DISTRIBUTED;USE_CUDA OR USE_ROCM;UNIX;NOT APPLE" OFF) option(USE_NNAPI "Use NNAPI" OFF) option(USE_NNPACK "Use NNPACK" ON) cmake_dependent_option(USE_NUMA "Use NUMA. Only available on Linux." ON "LINUX" @@ -880,10 +880,21 @@ cmake_dependent_option( USE_FBGEMM_GENAI "Whether to build FBGEMM GenAI quantized GEMM kernels.\ Will be disabled if not supported by the platform" - OFF - "USE_CUDA OR USE_ROCM" + ON + "USE_ROCM" OFF) +IF(USE_FBGEMM_GENAI AND USE_ROCM AND NOT "gfx942" IN_LIST PYTORCH_ROCM_ARCH) + message(WARNING "Unsupported ROCM arch for FBGEMM GenAI, will set USE_FBGEMM_GENAI to OFF") + set(USE_FBGEMM_GENAI off) +endif() + +# Set USE_FBGEMM_GENAI to ON for CUDA build on SM100 +if(USE_CUDA AND "$ENV{TORCH_CUDA_ARCH_LIST}" MATCHES "10.0a") + message(WARNING "Setting USE_FBGEMM_GENAI to ON for CUDA build on SM100") + set(USE_FBGEMM_GENAI ON) +endif() + # CAVEAT: Again, Flash Attention2 will error while building for sm52 while Mem # Eff Attention won't cmake_dependent_option( diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index dade8f4ec6ec..9d2b5d355391 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -88,13 +88,13 @@ source venv/bin/activate # or `& .\venv\Scripts\Activate.ps1` on Windows * If you want to have no-op incremental rebuilds (which are fast), see [Make no-op build fast](#make-no-op-build-fast) below. -* When installing with `python -m pip install -e .` (in contrast to `python -m pip install .`) Python runtime will use +* When installing with `python -m pip install -e . -v --no-build-isolation` (in contrast to `python -m pip install . -v --no-build-isolation`) Python runtime will use the current local source-tree when importing `torch` package. 
(This is done by creating [`.egg-link`](https://wiki.python.org/moin/PythonPackagingTerminology#egg-link) file in `site-packages` folder) This way you do not need to repeatedly install after modifying Python files (`.py`). However, you would need to reinstall if you modify Python interface (`.pyi`, `.pyi.in`) or non-Python files (`.cpp`, `.cc`, `.cu`, `.h`, ...). - One way to avoid running `python -m pip install -e .` every time one makes a change to C++/CUDA/ObjectiveC files on Linux/Mac, + One way to avoid running `python -m pip install -e . -v --no-build-isolation` every time one makes a change to C++/CUDA/ObjectiveC files on Linux/Mac, is to create a symbolic link from `build` folder to `torch/lib`, for example, by issuing following: ```bash pushd torch/lib; sh -c "ln -sf ../../build/lib/libtorch_cpu.* ."; popd @@ -116,7 +116,7 @@ source venv/bin/activate # or `& .\venv\Scripts\Activate.ps1` on Windows Next run `python setup.py clean`. After that, you can install in editable mode again. -* If you run into errors when running `python -m pip install -e .`, here are some debugging steps: +* If you run into errors when running `python -m pip install -e . -v --no-build-isolation`, here are some debugging steps: 1. Run `printf '#include \nint main() { printf("Hello World");}'|clang -x c -; ./a.out` to make sure your CMake works and can compile this simple Hello World program without errors. 2. Nuke your `build` directory. The `setup.py` script compiles binaries into the `build` folder and caches many @@ -129,10 +129,10 @@ source venv/bin/activate # or `& .\venv\Scripts\Activate.ps1` on Windows git clean -xdf python setup.py clean git submodule update --init --recursive - python -m pip install -r requirements.txt + python -m pip install --group dev python -m pip install --no-build-isolation -v -e . ``` - 4. The main step within `python -m pip install -e .` is running `cmake --build build` from the `build` directory. If you want to + 4. The main step within `python -m pip install -e . -v --no-build-isolation` is running `make` from the `build` directory. If you want to experiment with some environment variables, you can pass them into the command: ```bash ENV_KEY1=ENV_VAL1[, ENV_KEY2=ENV_VAL2]* CMAKE_FRESH=1 python -m pip install --no-build-isolation -v -e . @@ -259,6 +259,7 @@ dependencies as well as the nightly binaries into the repo directory. support for PyTorch. * [tools](tools) - Code generation scripts for the PyTorch library. See [README](tools/README.md) of this directory for more details. +* [torchgen](torchgen) - contains the logic and tooling for generating PyTorch's low-level C++ and Python bindings from operator definitions, typically specified in native_functions.yaml * [test](test) - Python unit tests for PyTorch Python frontend. * [test_torch.py](test/test_torch.py) - Basic tests for PyTorch functionality. @@ -294,7 +295,7 @@ The following packages should be installed with `pip`: - `pytest` - recommended to run tests more selectively Running ``` -pip install -r requirements.txt +pip install --group dev ``` will install these dependencies for you. @@ -645,9 +646,9 @@ can be selected interactively with your mouse to zoom in on a particular part of the program execution timeline. The `--native` command-line option tells `py-spy` to record stack frame entries for PyTorch C++ code. To get line numbers for C++ code it may be necessary to compile PyTorch in debug mode by prepending -your `python -m pip install -e .` call to compile PyTorch with `DEBUG=1`. 
-Depending on your operating system it may also be necessary to run `py-spy` with -root privileges. +your `python -m pip install -e . -v --no-build-isolation` call to compile +PyTorch with `DEBUG=1`. Depending on your operating system it may also be +necessary to run `py-spy` with root privileges. `py-spy` can also work in an `htop`-like "live profiling" mode and can be tweaked to adjust the stack sampling rate, see the `py-spy` readme for more @@ -655,10 +656,10 @@ details. ## Managing multiple build trees -One downside to using `python -m pip install -e .` is that your development -version of PyTorch will be installed globally on your account (e.g., if -you run `import torch` anywhere else, the development version will be -used). +One downside to using `python -m pip install -e . -v --no-build-isolation` is +that your development version of PyTorch will be installed globally on your +account (e.g., if you run `import torch` anywhere else, the development version +will be used). If you want to manage multiple builds of PyTorch, you can make use of [venv environments](https://docs.python.org/3/library/venv.html) to maintain @@ -719,7 +720,7 @@ options. ### Code completion and IDE support -When using `python -m pip install -e .`, PyTorch will generate +When using `python -m pip install -e . -v --no-build-isolation`, PyTorch will generate a `compile_commands.json` file that can be used by many editors to provide command completion and error highlighting for PyTorch's C++ code. You need to `pip install ninja` to generate accurate diff --git a/Dockerfile b/Dockerfile index 7b8964bd860e..331cf00593cb 100644 --- a/Dockerfile +++ b/Dockerfile @@ -50,11 +50,10 @@ RUN git submodule update --init --recursive FROM conda as conda-installs ARG PYTHON_VERSION=3.11 ARG CUDA_PATH=cu121 -ARG CUDA_CHANNEL=nvidia ARG INSTALL_CHANNEL=whl/nightly # Automatically set by buildx -RUN /opt/conda/bin/conda update -y -n base -c defaults conda -RUN /opt/conda/bin/conda install -y python=${PYTHON_VERSION} +# pinning version of conda here see: https://github.com/pytorch/pytorch/issues/164574 +RUN /opt/conda/bin/conda install -y python=${PYTHON_VERSION} conda=25.7.0 ARG TARGETPLATFORM diff --git a/README.md b/README.md index 4c18724be0c0..99e6dabd1618 100644 --- a/README.md +++ b/README.md @@ -243,7 +243,7 @@ git submodule update --init --recursive ```bash # Run this command from the PyTorch directory after cloning the source code using the “Get the PyTorch Source“ section above -pip install -r requirements.txt +pip install --group dev ``` **On Linux** @@ -394,7 +394,7 @@ On macOS ```bash export CMAKE_PREFIX_PATH="${CONDA_PREFIX:-'$(dirname $(which conda))/../'}:${CMAKE_PREFIX_PATH}" -MACOSX_DEPLOYMENT_TARGET=10.9 CC=clang CXX=clang++ CMAKE_ONLY=1 python setup.py build +MACOSX_DEPLOYMENT_TARGET=11.0 CMAKE_ONLY=1 python setup.py build ccmake build # or cmake-gui build ``` diff --git a/aten/src/ATen/CMakeLists.txt b/aten/src/ATen/CMakeLists.txt index d8787154a213..38b383c2bb31 100644 --- a/aten/src/ATen/CMakeLists.txt +++ b/aten/src/ATen/CMakeLists.txt @@ -1,5 +1,15 @@ cmake_minimum_required(VERSION 3.27 FATAL_ERROR) set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake ${CMAKE_MODULE_PATH}) +list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake/public") +if(USE_ROCM) + include(LoadHIP OPTIONAL RESULT_VARIABLE _had_loadhip) + if(_had_loadhip) + # Exposed by LoadHIP.cmake, e.g. 
"7.1.2" or "7.2.0" + message(STATUS "LoadHIP loaded: ROCM_VERSION_DEV='${ROCM_VERSION_DEV}'") + else() + message(WARNING "LoadHIP.cmake not found; ROCM_VERSION_DEV unavailable") + endif() +endif() if(NOT MSVC) string(APPEND CMAKE_CXX_FLAGS " -Wno-ignored-qualifiers") @@ -216,7 +226,7 @@ file(GLOB mem_eff_attention_cuda_cpp "native/transformers/cuda/mem_eff_attention if(USE_CUDA AND (USE_FLASH_ATTENTION OR USE_MEM_EFF_ATTENTION)) add_library(flash_attention OBJECT EXCLUDE_FROM_ALL ${flash_attention_cuda_kernels_cu} ${flash_attention_cuda_cpp}) - target_include_directories(flash_attention PUBLIC + target_include_directories(flash_attention SYSTEM PUBLIC ${PROJECT_SOURCE_DIR}/third_party/flash-attention/csrc ${PROJECT_SOURCE_DIR}/third_party/flash-attention/include ${PROJECT_SOURCE_DIR}/third_party/cutlass/include @@ -252,47 +262,81 @@ if(USE_MEM_EFF_ATTENTION) list(APPEND ATen_ATTENTION_KERNEL_SRCS ${mem_eff_attention_cuda_kernels_cu}) endif() -IF(USE_FBGEMM_GENAI AND USE_ROCM AND NOT "gfx942" IN_LIST PYTORCH_ROCM_ARCH) - message(WARNING "Unsupported ROCM arch for FBGEMM GenAI, will set USE_FBGEMM_GENAI to OFF") - set(USE_FBGEMM_GENAI off) -endif() - # FBGEMM GenAI IF(USE_FBGEMM_GENAI) set(FBGEMM_THIRD_PARTY ${PROJECT_SOURCE_DIR}/third_party/fbgemm/external/) - set(FBGEMM_GENAI_DIR ${PROJECT_SOURCE_DIR}/third_party/fbgemm/fbgemm_gpu/experimental/gen_ai/src/quantize) - - if(USE_ROCM) - # Only include the kernels we want to build to avoid increasing binary size. - file(GLOB_RECURSE fbgemm_genai_native_rocm_hip - "${FBGEMM_GENAI_DIR}/ck_extensions/fp8_rowwise_grouped/kernels/fp8_rowwise_grouped*.hip" - "${FBGEMM_GENAI_DIR}/ck_extensions/fp8_rowwise_grouped/fp8_rowwise_grouped_gemm.hip") - set_source_files_properties(${fbgemm_genai_native_rocm_hip} PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1) - - # Add additional HIPCC compiler flags for performance - set(FBGEMM_GENAI_EXTRA_HIPCC_FLAGS - -mllvm - -amdgpu-coerce-illegal-types=1 - -mllvm - -enable-post-misched=0 - -mllvm - -greedy-reverse-local-assignment=1 - -fhip-new-launch-api) - - hip_add_library( - fbgemm_genai STATIC - ${fbgemm_genai_native_rocm_hip} - HIPCC_OPTIONS ${HIP_HCC_FLAGS} ${FBGEMM_GENAI_EXTRA_HIPCC_FLAGS}) + set(FBGEMM_GENAI_SRCS ${PROJECT_SOURCE_DIR}/third_party/fbgemm/fbgemm_gpu/experimental/gen_ai/src/quantize) + if(USE_CUDA) + # To avoid increasing the build time/binary size unnecessarily, use an allow-list of kernels to build. + # If you want to integrate a kernel from FBGEMM into torch, you have to add it here. 
+ set(FBGEMM_CUTLASS_KERNELS_REGEX ".*mx8mx8bf16_grouped.*") + file(GLOB_RECURSE fbgemm_genai_native_cuda_cu + "${FBGEMM_GENAI_SRCS}/cutlass_extensions/*.cu" + "${FBGEMM_GENAI_SRCS}/cutlass_extensions/**/*.cu") + list(FILTER fbgemm_genai_native_cuda_cu INCLUDE REGEX ${FBGEMM_CUTLASS_KERNELS_REGEX}) + + file(GLOB_RECURSE fbgemm_genai_native_cuda_cpp + "${FBGEMM_GENAI_SRCS}/common/*.cpp" + ) + + # Combine all source files into a single list + list(APPEND fbgemm_genai_all_sources + ${fbgemm_genai_native_cuda_cu} + ${fbgemm_genai_native_cuda_cpp} + ) + + # Now, create the library and provide the sources at the same time + add_library(fbgemm_genai OBJECT ${fbgemm_genai_all_sources}) + set_target_properties(fbgemm_genai PROPERTIES POSITION_INDEPENDENT_CODE ON) - target_compile_definitions(fbgemm_genai PRIVATE FBGEMM_GENAI_NO_EXTENDED_SHAPES) + + set(fbgemm_genai_mx8mx8bf16_grouped + "${FBGEMM_GENAI_SRCS}/cutlass_extensions/mx8mx8bf16_grouped/" + ) target_include_directories(fbgemm_genai PUBLIC - # FBGEMM version of Composable Kernel is used due to some customizations - ${FBGEMM_THIRD_PARTY}/composable_kernel/include - ${FBGEMM_THIRD_PARTY}/composable_kernel/library/include - ${FBGEMM_GENAI_DIR}/include/ - ${FBGEMM_GENAI_DIR}/common/include/ + ${FBGEMM_THIRD_PARTY}/cutlass/include + ${FBGEMM_THIRD_PARTY}/cutlass/tools/util/include + ${fbgemm_genai_mx8mx8bf16_grouped} + ${FBGEMM_GENAI_SRCS}/common/include/ # includes fbgemm_gpu/quantize/utils.h, fbgemm_gpu/quantize/tuning_cache.hpp + ${FBGEMM_GENAI_SRCS}/include/ # includes fbgemm_gpu/torch_ops.h ) + else() + if(USE_ROCM) + # Only include the kernels we want to build to avoid increasing binary size. + file(GLOB_RECURSE fbgemm_genai_native_rocm_hip + "${FBGEMM_GENAI_SRCS}/ck_extensions/fp8_rowwise_grouped/kernels/fp8_rowwise_grouped*.hip" + "${FBGEMM_GENAI_SRCS}/ck_extensions/fp8_rowwise_grouped/fp8_rowwise_grouped_gemm.hip") + set_source_files_properties(${fbgemm_genai_native_rocm_hip} PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1) + + # Add additional HIPCC compiler flags for performance + set(FBGEMM_GENAI_EXTRA_HIPCC_FLAGS + -mllvm + -enable-post-misched=0 + -mllvm + -greedy-reverse-local-assignment=1 + -fhip-new-launch-api) + if(DEFINED ROCM_VERSION_DEV AND ROCM_VERSION_DEV VERSION_LESS "7.2.0") + list(PREPEND FBGEMM_GENAI_EXTRA_HIPCC_FLAGS -mllvm -amdgpu-coerce-illegal-types=1) + endif() + + hip_add_library( + fbgemm_genai STATIC + ${fbgemm_genai_native_rocm_hip} + HIPCC_OPTIONS ${HIP_HCC_FLAGS} ${FBGEMM_GENAI_EXTRA_HIPCC_FLAGS}) + set_target_properties(fbgemm_genai PROPERTIES POSITION_INDEPENDENT_CODE ON) + target_compile_definitions(fbgemm_genai PRIVATE FBGEMM_GENAI_NO_EXTENDED_SHAPES) + + target_include_directories(fbgemm_genai PUBLIC + # FBGEMM version of Composable Kernel is used due to some customizations + ${FBGEMM_THIRD_PARTY}/composable_kernel/include + ${FBGEMM_THIRD_PARTY}/composable_kernel/library/include + ${FBGEMM_THIRD_PARTY}/cutlass/include + ${FBGEMM_THIRD_PARTY}/cutlass/tools/util/include + ${FBGEMM_GENAI_SRCS}/common/include/ # includes fbgemm_gpu/quantize/utils.h, fbgemm_gpu/quantize/tuning_cache.hpp + ${FBGEMM_GENAI_SRCS}/include/ # includes fbgemm_gpu/torch_ops.h + ) + endif() endif() endif() @@ -635,12 +679,26 @@ if(USE_CUDA AND NOT USE_ROCM) add_definitions(-DCUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) list(APPEND ATen_CUDA_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/cutlass/include) list(APPEND ATen_CUDA_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/cutlass/tools/util/include) + + # Add FBGEMM_GENAI 
include directories for torch_ops.h + if(USE_FBGEMM_GENAI) + list(APPEND ATen_CUDA_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/fbgemm/fbgemm_gpu/experimental/gen_ai/src/quantize/include) + list(APPEND ATen_CUDA_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/fbgemm/fbgemm_gpu/experimental/gen_ai/src/quantize/common/include) + endif() + if($ENV{ATEN_STATIC_CUDA}) - list(APPEND ATen_CUDA_DEPENDENCY_LIBS - ${CUDA_LIBRARIES} - CUDA::cusparse_static - CUDA::cufft_static_nocallback - ) + if(CUDA_VERSION VERSION_LESS_EQUAL 12.9) + list(APPEND ATen_CUDA_DEPENDENCY_LIBS + ${CUDA_LIBRARIES} + CUDA::cusparse_static + CUDA::cufft_static_nocallback) + else() + list(APPEND ATen_CUDA_DEPENDENCY_LIBS + ${CUDA_LIBRARIES} + CUDA::cusparse_static + CUDA::cufft_static) + endif() + if(NOT BUILD_LAZY_CUDA_LINALG) list(APPEND ATen_CUDA_DEPENDENCY_LIBS CUDA::cusolver_static diff --git a/aten/src/ATen/DLConvertor.cpp b/aten/src/ATen/DLConvertor.cpp index 9632cd5ed698..98ad757946be 100644 --- a/aten/src/ATen/DLConvertor.cpp +++ b/aten/src/ATen/DLConvertor.cpp @@ -308,17 +308,44 @@ void fillVersion( // constructed out of ATen tensor template T* toDLPackImpl(const Tensor& src) { - // create a new tensor with possibly normalized strides - // gh-83069 - auto shape = src.sizes(); - auto strides = src.strides().vec(); - for (int i = 0; i < src.dim(); i++) { - if (shape[i] < 2) { - strides[i] = 1; + auto view = src; + + // Detect whether there is need to normalize the strides + // Background: gh-83069 + // + // However, normalizing strides can come at a high-cost + // to slow down toDLPack conversion 3x, so we + // only normalize if needed. + // + // The following code detects whether the src follows + // a continuous pattern. If the src follows such pattern (common-case) + // then we do not need to normalize the strides. 
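// Editor's illustration (not part of this patch), clarifying when the slow path
// described above is taken. Only dimensions of size 1 can carry an "arbitrary"
// stride that breaks the contiguous pattern without changing which bytes are
// addressed, e.g.:
//
//   shape {2, 3}, strides {3, 1}   -> matches the contiguous pattern, fast path
//   shape {1, 3}, strides {99, 1}  -> size-1 dim with stride 99 != expected 3,
//                                     stride rewritten to 1 via as_strided (gh-83069)
//
// A genuinely non-contiguous tensor (mismatch on a dim of size >= 2, e.g. a
// transpose) keeps its real strides and is exported unchanged, as before.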
+ bool need_normalize_strides = false; + int64_t expected_stride = 1; + for (int i = src.dim() - 1; i >= 0; i--) { + // detect if we do not meet continuous pattern + // and the size is 1, so there is opportunity to normalize + if (src.stride(i) != expected_stride && src.size(i) == 1) { + need_normalize_strides = true; + break; + } + expected_stride *= src.size(i); + } + + // less common case, try normalizing the strides + if (need_normalize_strides) { + // create a new tensor with possibly normalized strides + // gh-83069 + auto shape = src.sizes(); + auto strides = src.strides().vec(); + for (int i = 0; i < src.dim(); i++) { + if (shape[i] < 2) { + strides[i] = 1; + } } + view = src.as_strided(shape, strides, src.storage_offset()); } - auto view = src.as_strided(shape, strides, src.storage_offset()); ATenDLMTensor* atDLMTensor(new ATenDLMTensor); atDLMTensor->handle = view; atDLMTensor->tensor.manager_ctx = atDLMTensor; diff --git a/aten/src/ATen/DTensorState.cpp b/aten/src/ATen/DTensorState.cpp new file mode 100644 index 000000000000..0644aae3d070 --- /dev/null +++ b/aten/src/ATen/DTensorState.cpp @@ -0,0 +1,17 @@ +#include + +namespace at { + +namespace { +thread_local bool kDTensorAllowImplicitReplication = false; +} + +bool get_dtensor_allow_implicit_replication() { + return kDTensorAllowImplicitReplication; +} + +void set_dtensor_allow_implicit_replication(bool enabled) { + kDTensorAllowImplicitReplication = enabled; +} + +} // namespace at diff --git a/aten/src/ATen/DTensorState.h b/aten/src/ATen/DTensorState.h new file mode 100644 index 000000000000..07e89eaeddae --- /dev/null +++ b/aten/src/ATen/DTensorState.h @@ -0,0 +1,34 @@ +#pragma once + +#include + +namespace at { + +TORCH_API bool get_dtensor_allow_implicit_replication(); +TORCH_API void set_dtensor_allow_implicit_replication(bool enabled); + +struct DTensorAllowImplicitReplication { + DTensorAllowImplicitReplication() + : prev_dtensor_allow_implicit_replication_( + get_dtensor_allow_implicit_replication()) { + set_dtensor_allow_implicit_replication(true); + } + + DTensorAllowImplicitReplication(const DTensorAllowImplicitReplication&) = + delete; + DTensorAllowImplicitReplication& operator=( + const DTensorAllowImplicitReplication&) = delete; + DTensorAllowImplicitReplication(DTensorAllowImplicitReplication&&) = delete; + DTensorAllowImplicitReplication& operator=( + DTensorAllowImplicitReplication&&) = delete; + + ~DTensorAllowImplicitReplication() { + set_dtensor_allow_implicit_replication( + prev_dtensor_allow_implicit_replication_); + } + + private: + bool prev_dtensor_allow_implicit_replication_; +}; + +} // namespace at diff --git a/aten/src/ATen/FunctionalStorageImpl.cpp b/aten/src/ATen/FunctionalStorageImpl.cpp index a5512818343f..8bca495abdc6 100644 --- a/aten/src/ATen/FunctionalStorageImpl.cpp +++ b/aten/src/ATen/FunctionalStorageImpl.cpp @@ -9,11 +9,6 @@ namespace at::functionalization { -ViewMeta ViewMeta::to_out_idx(int64_t out_idx) { - if (out_idx == this->out_index) return *this; - return ViewMeta(forward_fn, reverse_fn, has_symbolic_inputs, is_multi_output, is_as_strided, out_idx); -} - // Note [Functionalization: Alias Removal Part 2] // See Note [Functionalization: Alias Removal] for more details. // This function applies a single update from one of the views to the StorageImpl. 
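Side note on the new `aten/src/ATen/DTensorState.{h,cpp}` files above: the guard saves the current thread-local flag, sets it to true, and restores the previous value on destruction. A minimal usage sketch (editor's example, not part of the patch; it assumes only the declarations shown in the new header):

```cpp
#include <ATen/DTensorState.h>
#include <cassert>

void with_implicit_replication() {
  assert(!at::get_dtensor_allow_implicit_replication());
  {
    // Flips the thread-local flag for the lifetime of this scope.
    at::DTensorAllowImplicitReplication guard;
    assert(at::get_dtensor_allow_implicit_replication());
    // ... run DTensor ops that may treat plain tensors as implicitly replicated ...
  }
  // The previous value is restored when the guard is destroyed, so nested
  // guards compose correctly and the flag never leaks across scopes.
  assert(!at::get_dtensor_allow_implicit_replication());
}
```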
@@ -42,12 +37,12 @@ ViewMeta ViewMeta::to_out_idx(int64_t out_idx) { static const Tensor apply_update(const FunctionalStorageImpl::Update& update, const Tensor& base) { at::Tensor t = update.new_val; TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(t)); - if (update.view_metas.empty()) return t; + if (update.view_metas.empty()) { return t; } std::vector tmp_values({base}); tmp_values.reserve(update.view_metas.size()); for (size_t i = 0; i < update.view_metas.size() - 1; ++i) { - at::Tensor next_view = update.view_metas[i].forward_fn(tmp_values.back(), update.view_metas[i].out_index); + at::Tensor next_view = update.view_metas[i]->forward(tmp_values.back()); // NB: We only actually need tmp_values for ops like select/slice/diagonal/squeeze/as_strided // All of these ops require additional information to recover the sizes of the original tensor. // If need to, we could probably apply this optimization and only bother computing tmp_values @@ -55,9 +50,8 @@ static const Tensor apply_update(const FunctionalStorageImpl::Update& update, co tmp_values.push_back(std::move(next_view)); } for(int64_t i = static_cast(update.view_metas.size()) - 1; i >= 0; --i) { - int64_t out_idx = update.view_metas[i].out_index; // Each view inverse is implemented in ViewInverses.cpp. - t = update.view_metas[i].reverse_fn(tmp_values[i], t, out_idx); + t = update.view_metas[i]->reverse(tmp_values[i], t); } TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(t)); return t; @@ -111,13 +105,13 @@ FunctionalStorageImpl::FunctionalStorageImpl(const Tensor& base) TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(base_)); } -void FunctionalStorageImpl::add_update(const Tensor& updated_val, const std::vector& metas) { +void FunctionalStorageImpl::add_update(const Tensor& updated_val, const std::vector>& metas) { TORCH_CHECK(!frozen_, "cannot mutate tensors with frozen storage"); if (metas.size() > 1) { for (size_t i = 1; i < metas.size(); ++i) { // Skipping this check for XLA. Would be good to add it back, but it is failing XLA CI - TORCH_CHECK(updated_val.device().type() == c10::DeviceType::XLA || !metas[i].is_as_strided, + TORCH_CHECK(updated_val.device().type() == c10::DeviceType::XLA || !metas[i]->is_as_strided, "During torch.compile, encountered a mutation on a view chain of length ", metas.size(), ", where view ", i, " was an as_strided() call. as_strided() is non-compositional, and therefore is not possible to functionalize properly today," "so this behavior is banned in compile. As a workaround, you can either remove the mutation from the model code, or you " diff --git a/aten/src/ATen/FunctionalStorageImpl.h b/aten/src/ATen/FunctionalStorageImpl.h index 8cd1cb7434aa..0c9c1fd775f3 100644 --- a/aten/src/ATen/FunctionalStorageImpl.h +++ b/aten/src/ATen/FunctionalStorageImpl.h @@ -8,44 +8,89 @@ namespace at::functionalization { // See Note [Functionalization Pass In Core] +enum class InverseReturnMode { + /// Specifies that functional inverses should always return a view. + AlwaysView, + /// Specifies that functional inverses should always return a non-view / copy. + NeverView, + /// Specifies that functional inverses should return a view unless a (copying) + /// scatter + /// inverse exists, in which case that will be used instead. + /// This avoids as_strided() calls that can be difficult for subclasses to + /// handle. 
+ ViewOrScatterInverse, +}; + +#define FUNCTIONALIZATION_VIEWMETA_NAME(TYPE) \ + static const char* name() { \ + return #TYPE; \ + } + +#define FUNCTIONALIZATION_VIEWMETA_SERIALIZABLE_TUPLE(...) \ + using SerializableTuple = std::tuple<__VA_ARGS__> + // ViewMeta is a class used by the functionalization pass to navigate between // a base tensor and a view tensor. // For example, if I call `b = a.view1(...)` -// the functionalization pass will generate and store a ViewMeta on b that looks -// like: +// the functionalization pass will generate and store a ViewMeta specialization +// for `view1` operation on b that looks like: // -// ViewMeta( -// [](const Tensor& base, int64_t mutated_view_idx) { -// return base.view1(...); -// }, -// [](const at::Tensor& base, const at::Tensor& mutated_view, -// int64_t mutated_view_idx) -> at::Tensor { -// return at::functionalization::impl::view1_inverse(base, mutated_view, -// ...); +// struct TORCH_API view1_ViewMeta : public ViewMeta { +// FUNCTIONALIZATION_VIEWMETA_NAME(view1_ViewMeta); +// FUNCTIONALIZATION_VIEWMETA_SERIALIZABLE_TUPLE( +// bool /* reapply_views */, +// const std::vector&); +// +// view1_ViewMeta(const SerializableTuple& tpl) +// : view1_ViewMeta(std::get<0>(tpl), std::get<1>(tpl)) {} +// +// view1_ViewMeta(bool reapply_views, const std::vector& size) +// : ViewMeta(/*has_symbolic_inputs=*/false), +// reapply_views(reapply_views), +// size(size) {} +// +// Tensor forward(const Tensor& base) override { +// return base.view1(...); // } // -// The forward_fn lambda describes how to replay view1 on a tensor. +// Tensor reverse(const Tensor& base, const Tensor& mutated_view) override { +// return at::functionalization::impl::view1_inverse(base, mutated_view, +// ...); +// } // -// The reverse_fn lambda describes how, given a tensor that is already a view, +// SerializableTuple to_serializable_tuple() { +// return std::make_tuple(reapply_views, size); +// } +// +// bool reapply_views; +// std::vector size; +// }; +// +// The forward function describes how to replay view1 on a tensor. +// +// The reverse function describes how, given a tensor that is already a view, // how to get the corresponding base tensor. See Note [Functionalization Pass: // View Inverses] for details. +// +// `SerializedTuple` is a typedef that defines an `std::tuple<...>` type +// representing the `ViewMeta` instance state. Methods that take in/return such +// a type are used for supporting pickle serialization. struct ViewMeta { ViewMeta( - std::function forward, - std::function reverse, bool has_symbolic_inputs, bool is_multi_output = false, bool is_as_strided = false, int64_t out_idx = 0) - : forward_fn(std::move(forward)), - reverse_fn(std::move(reverse)), - out_index(out_idx), + : out_index(out_idx), is_multi_output(is_multi_output), is_as_strided(is_as_strided), has_symbolic_inputs(has_symbolic_inputs) {} - std::function forward_fn; - std::function reverse_fn; + virtual ~ViewMeta() = default; + + virtual Tensor forward(const Tensor& base) = 0; + virtual Tensor reverse(const Tensor& base, const Tensor& mutated_view) = 0; + // See Note [out_idx in ViewMeta] int64_t out_index; @@ -57,10 +102,17 @@ struct ViewMeta { // Tells us if this view operation has any symbolic inputs bool has_symbolic_inputs; - // Returns a copy of the current ViewMeta, if out_idx matches the current - // out_index. Otherwise, returns a new ViewMeta with the same forward/reverse + // Returns a new ViewMeta with the same forward/reverse // functions, but a new out index. 
- ViewMeta to_out_idx(int64_t out_idx); + // + // This method should be implemented by those `ViewMeta` that have more than + // one output. + virtual std::shared_ptr to_out_index(int64_t out_index) { + TORCH_CHECK_NOT_IMPLEMENTED( + false, + "ViewMeta::to_out_index not implemented. ", + "Likely because there's only one output."); + } }; // FunctionalStorageImpl is a subclass of StorageImpl used by the @@ -93,14 +145,14 @@ struct TORCH_API FunctionalStorageImpl : public c10::StorageImpl { // NOLINTNEXTLINE(cppcoreguidelines-avoid-const-or-ref-data-members) const at::Tensor new_val; // NOLINTNEXTLINE(cppcoreguidelines-avoid-const-or-ref-data-members) - const std::vector view_metas; + const std::vector> view_metas; }; explicit FunctionalStorageImpl(const Tensor& value); void add_update( const Tensor& updated_val, - const std::vector& view_metas); + const std::vector>& view_metas); bool apply_updates(); const Tensor& base() { return base_; diff --git a/aten/src/ATen/FunctionalTensorWrapper.cpp b/aten/src/ATen/FunctionalTensorWrapper.cpp index 7d5e4e84e861..3a574fa7d491 100644 --- a/aten/src/ATen/FunctionalTensorWrapper.cpp +++ b/aten/src/ATen/FunctionalTensorWrapper.cpp @@ -129,17 +129,19 @@ void FunctionalTensorWrapper::freeze_storage() const { // - view_value: The output tensor that we need to wrap. // - base: The "base" of the view that `view_value` was generated from. // See Note [Functionalization: Alias Removal Part 2] for more details on the mutation replay logic. -FunctionalTensorWrapper::FunctionalTensorWrapper(const Tensor& view_value, const FunctionalTensorWrapper* base, const functionalization::ViewMeta& meta) - : c10::TensorImpl( - c10::DispatchKeySet(DispatchKey::Functionalize), - view_value.dtype(), - view_value.device() - ), - value_(view_value), - is_multi_output_view_(base->is_multi_output_view_ || meta.is_multi_output), - was_storage_changed_(base->was_storage_changed_), - is_symbolic_(base->is_symbolic_) -{ +FunctionalTensorWrapper::FunctionalTensorWrapper( + const Tensor& view_value, + const FunctionalTensorWrapper* base, + const std::shared_ptr& meta) + : c10::TensorImpl( + c10::DispatchKeySet(DispatchKey::Functionalize), + view_value.dtype(), + view_value.device()), + value_(view_value), + is_multi_output_view_( + base->is_multi_output_view_ || meta->is_multi_output), + was_storage_changed_(base->was_storage_changed_), + is_symbolic_(base->is_symbolic_) { TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(value_)); TORCH_INTERNAL_ASSERT(!value_.key_set().has(c10::DispatchKey::Functionalize)); set_constructor_metadata(); @@ -148,11 +150,10 @@ FunctionalTensorWrapper::FunctionalTensorWrapper(const Tensor& view_value, const view_metas_ = base->view_metas_; // copy } view_metas_.push_back(meta); - maybe_mark_symbolic(meta); + maybe_mark_symbolic(meta.get()); storage_ = base->storage_; // alias this tensor's storage with the base tensor's } - functionalization::FunctionalStorageImpl* FunctionalTensorWrapper::functional_storage_impl() const { return static_cast(storage_.unsafeGetStorageImpl()); } @@ -176,18 +177,18 @@ bool FunctionalTensorWrapper::is_up_to_date() const { } // See Note [Functionalization Pass - Inplace View Ops] -void FunctionalTensorWrapper::mutate_view_meta(const at::functionalization::ViewMeta& meta) { +void FunctionalTensorWrapper::mutate_view_meta(const std::shared_ptr& meta) { view_metas_.push_back(meta); // Manually track the fact that this tensor received a metadata mutation! 
has_metadata_mutation_ = true; // Mark this tensor as being symbolic if there are any symbolic inputs used by the view operation. - maybe_mark_symbolic(meta); + maybe_mark_symbolic(meta.get()); // Note [Functionalization Pass - Inplace View Ops] // So, these ops are special - they're mutation AND view ops. They get special codegen. // An example is transpose_, e.g. `a.transpose_()` // Calling transpose_() should ensure that a gets an alias, and append the new ViewMeta to a's current list of ViewMetas. at::AutoDispatchSkipFunctionalize guard; - value_ = meta.forward_fn(value_, meta.out_index); + value_ = meta->forward(value_); TORCH_INTERNAL_ASSERT(!value_.key_set().has(c10::DispatchKey::Functionalize)); } @@ -368,15 +369,8 @@ void FunctionalTensorWrapper::sync_() { regenerate_from_base(); } -Tensor FunctionalTensorWrapper::apply_view_metas(const Tensor& base) { - auto t = base; - - // Reapply views to get the viewed tensor from the base in alias_ - for (auto& view_meta: view_metas_) { - t = view_meta.forward_fn(t, view_meta.out_index); - } - - return t; +const std::vector>& FunctionalTensorWrapper::view_metas() const { + return view_metas_; } void FunctionalTensorWrapper::regenerate_from_base() { @@ -385,7 +379,7 @@ void FunctionalTensorWrapper::regenerate_from_base() { auto t = storage_impl->base(); TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(t)); - t = apply_view_metas(t); + t = at::functionalization::impl::apply_view_meta_sequence(t, view_metas_); TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(t)); replace_(t, /*from_lazy_regenerate=*/true); @@ -724,11 +718,11 @@ bool isFunctionalTensor(const std::optional& t) { } bool isFunctionalTensor(const c10::List<::std::optional>& t_list) { - if (t_list.empty()) return false; + if (t_list.empty()) { return false; } auto functional_count = 0; for (const auto i : c10::irange(t_list.size())) { auto const & e= t_list[i]; - if (!e.has_value() || !e->defined()) continue; + if (!e.has_value() || !e->defined()) { continue; } if (isFunctionalTensor(e)) { ++functional_count; } @@ -738,10 +732,10 @@ bool isFunctionalTensor(const c10::List<::std::optional>& t_list) { template static bool isFunctionalTensorIListRef(c10::IListRef list) { - if (list.size() == 0) return false; + if (list.size() == 0) { return false; } auto functional_count = 0; for (const auto& tensor : list) { - if (!tensor.defined()) continue; + if (!tensor.defined()) { continue; } if (isFunctionalTensor(tensor)) { ++functional_count; } @@ -759,20 +753,28 @@ void freeze_functional_tensor(const Tensor& tensor) { functional_base_impl->freeze_storage(); } -Tensor create_functional_tensor_with_view_meta(const at::Tensor& view_to_wrap, const at::Tensor& base, functionalization::ViewMeta meta, int64_t out_idx) { +Tensor create_functional_tensor_with_view_meta( + const at::Tensor& view_to_wrap, + const at::Tensor& base, + const std::shared_ptr& meta, + int64_t out_idx) { TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(view_to_wrap)); TORCH_INTERNAL_ASSERT(at::functionalization::impl::isFunctionalTensor(base)); auto functional_base_impl = at::functionalization::impl::unsafeGetFunctionalWrapper(base); + auto meta_ = meta; if (out_idx != 0) { // Note [out_idx in ViewMeta] // When a view op outputs multiple tensors, each output needs its own separate ViewMeta. // Each ViewMeta also tracks the index of the particular output tensor, which is needed in the reverse function. 
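  // Editor's note (illustration, not part of this patch): a concrete case for
  // Note [out_idx in ViewMeta] is a multi-output view such as
  //   auto chunks = base.split(2);   // chunks[0], chunks[1], ...
  // All outputs replay the same forward()/reverse() logic, but each wrapper
  // records its own out_index so that reverse() scatters the mutated view back
  // into the right piece of the base; to_out_index(out_idx) below produces the
  // per-output ViewMeta for every output other than the first.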
- meta = meta.to_out_idx(out_idx); + meta_ = meta->to_out_index(out_idx); } - return at::detail::make_tensor(view_to_wrap, functional_base_impl, meta); + return at::detail::make_tensor(view_to_wrap, functional_base_impl, meta_); } -std::vector create_functional_tensor_with_view_meta(ITensorListRef view_to_wrap, const at::Tensor& base, const functionalization::ViewMeta& meta) { +std::vector create_functional_tensor_with_view_meta( + ITensorListRef view_to_wrap, + const at::Tensor& base, + const std::shared_ptr& meta) { std::vector outputs(view_to_wrap.size()); int64_t i = 0; for (const auto& tensor : view_to_wrap) { @@ -782,12 +784,22 @@ std::vector create_functional_tensor_with_view_meta(ITensorListRef view_ return outputs; } -void mutate_view_meta(const at::Tensor& self, const functionalization::ViewMeta& meta) { +void mutate_view_meta(const at::Tensor& self, const std::shared_ptr& meta) { TORCH_INTERNAL_ASSERT(at::functionalization::impl::isFunctionalTensor(self)); auto self_impl = at::functionalization::impl::unsafeGetFunctionalWrapper(self); self_impl->mutate_view_meta(meta); } +Tensor apply_view_meta_sequence( + const Tensor& base, + const std::vector>& sequence) { + Tensor r = base; + for (auto& vm : sequence) { + r = vm->forward(r); + } + return r; +} + // Note [Propagating strides in the functionalization pass] // In order to properly compute stride information, the functionalization pass // calls each {view} reference implementations with meta tensors. @@ -881,7 +893,7 @@ void functionalize_op_helper(const c10::OperatorHandle& op, torch::jit::Stack* s const auto& ivalue = returns[idx]; if (ivalue.isTensor()) { const auto& t = ivalue.toTensor(); - if (!t.defined()) continue; + if (!t.defined()) { continue; } at::functionalization::impl::sync(t); auto t_new = c10::IValue(at::functionalization::impl::from_functional_tensor(t)); (*stack)[returns_begin + idx] = t_new; diff --git a/aten/src/ATen/FunctionalTensorWrapper.h b/aten/src/ATen/FunctionalTensorWrapper.h index b260b7c9f958..6d9050728da7 100644 --- a/aten/src/ATen/FunctionalTensorWrapper.h +++ b/aten/src/ATen/FunctionalTensorWrapper.h @@ -56,7 +56,7 @@ struct TORCH_API FunctionalTensorWrapper : public c10::TensorImpl { explicit FunctionalTensorWrapper( const Tensor& view_value, const FunctionalTensorWrapper* base, - const functionalization::ViewMeta& meta); + const std::shared_ptr& meta); // Get the underlying, actual tensor, that doesn't know anything about // functionalization. @@ -99,17 +99,17 @@ struct TORCH_API FunctionalTensorWrapper : public c10::TensorImpl { ->are_all_mutations_under_no_grad_or_inference_mode(); } - void maybe_mark_symbolic(const functionalization::ViewMeta& meta) { - is_symbolic_ = is_symbolic_ | meta.has_symbolic_inputs; + void maybe_mark_symbolic(functionalization::ViewMeta* meta) { + is_symbolic_ = is_symbolic_ | meta->has_symbolic_inputs; } bool is_symbolic() const { return is_symbolic_; } - // Runs the forward_fn of every ViewMeta collected in the current instance - // to some other base. - Tensor apply_view_metas(const Tensor& base); + // Retrieves the ViewMeta sequence of this tensor. + const std::vector>& view_metas() + const; // Sync's the underlying tensor with its alias, if it's out of date. This // involves two steps: 1) Apply any pending updates/mutations to the alias 2) @@ -146,7 +146,8 @@ struct TORCH_API FunctionalTensorWrapper : public c10::TensorImpl { // from the base tensor. This method is used by inplace-view ops like // transpose_. 
It appends a ViewMeta to the existing stack, and refreshes the // tensor by replaying the views off of the alias. - void mutate_view_meta(const at::functionalization::ViewMeta& meta); + void mutate_view_meta( + const std::shared_ptr& meta); // Custom implementation of self.set_(src) void set__impl(const FunctionalTensorWrapper* other); @@ -285,7 +286,7 @@ struct TORCH_API FunctionalTensorWrapper : public c10::TensorImpl { bool is_symbolic_ = false; size_t generation_ = 0; - std::vector view_metas_; + std::vector> view_metas_; protected: static void copy_tensor_metadata( @@ -377,16 +378,20 @@ TORCH_API void propagate_xla_data_direct( Tensor create_functional_tensor_with_view_meta( const Tensor& view_to_wrap, const Tensor& base, - functionalization::ViewMeta meta, + const std::shared_ptr& meta, int64_t out_idx = 0); std::vector create_functional_tensor_with_view_meta( ITensorListRef view_to_wrap, const Tensor& base, - const functionalization::ViewMeta& meta); + const std::shared_ptr& meta); void mutate_view_meta( const Tensor& self, - const functionalization::ViewMeta& meta); + const std::shared_ptr& meta); + +TORCH_API Tensor apply_view_meta_sequence( + const Tensor& base, + const std::vector>& sequence); void set_sizes_strides_offset(const Tensor& out, const Tensor& meta_out); void set_sizes_strides_offset( diff --git a/aten/src/ATen/FunctionalizeFallbackKernel.cpp b/aten/src/ATen/FunctionalizeFallbackKernel.cpp index 97094c9f125a..10f988b4d281 100644 --- a/aten/src/ATen/FunctionalizeFallbackKernel.cpp +++ b/aten/src/ATen/FunctionalizeFallbackKernel.cpp @@ -1,3 +1,5 @@ +#include + #include #include #include @@ -7,7 +9,6 @@ #include #include #include -#include #ifndef AT_PER_OPERATOR_HEADERS #include @@ -28,6 +29,31 @@ #include #endif +namespace at::functionalization { + +Tensor resize__ViewMeta::forward(const Tensor& base) { + if (reapply_views) { + return base.as_strided(size, c10::contiguous_strides(size)); + } else { + return at::as_strided_copy(base, size, c10::contiguous_strides(size)); + } +} + +Tensor resize__ViewMeta::reverse(const Tensor& base, const Tensor& mutated_view) { + return base.as_strided_scatter( + mutated_view, size, c10::contiguous_strides(size)); +} + +Tensor _unsafe_view_ViewMeta::forward(const Tensor& base) { + return at::_unsafe_view_symint(base, size); +} + +Tensor _unsafe_view_ViewMeta::reverse(const Tensor& base, const Tensor& mutated_view) { + return at::_unsafe_view_symint(mutated_view, base.sym_sizes()); +} + +} // namespace at::functionalization + namespace { void functionalizeFallback(const c10::OperatorHandle& op, c10::DispatchKeySet dispatchKeySet [[maybe_unused]], torch::jit::Stack* stack) { const auto& schema = op.schema(); @@ -106,7 +132,9 @@ namespace { const auto& ivalue = returns[idx]; if (ivalue.isTensor() && should_wrap_outputs) { const auto& t = ivalue.toTensor(); - if (!t.defined()) continue; + if (!t.defined()) { + continue; + } auto t_new = c10::IValue(at::functionalization::impl::to_functional_tensor(t)); (*stack)[returns_begin + idx] = t_new; } else if (ivalue.isTensorList() && should_wrap_outputs) { @@ -169,19 +197,8 @@ static const at::Tensor & resize__functionalization(c10::DispatchKeySet dispatch // The output of resizing is equivalent to taking a slice of a larger tensor. // We have to emulate this "slicing" with an as_strided call. 
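  // Editor's illustration (not part of this patch): when the new size fits in
  // the existing storage, functionalized resize_ is modeled as viewing a prefix
  // of the larger tensor, e.g. for a base of 16 contiguous elements:
  //   base.as_strided({2, 2}, {2, 1})                      // forward(): first 4 elements
  //   base.as_strided_scatter(mutated, {2, 2}, {2, 1})     // reverse(): write them back
  // which is what the new resize__ViewMeta::forward/reverse shown above implement
  // via c10::contiguous_strides(size).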
auto reapply_views = at::functionalization::impl::getFunctionalizationReapplyViewsTLS(); - at::functionalization::ViewMeta view_meta = at::functionalization::ViewMeta( - [reapply_views = reapply_views, size = size.vec()](const at::Tensor & base, int64_t mutated_view_idx [[maybe_unused]]) -> at::Tensor { - if (reapply_views) { - return base.as_strided(size, c10::contiguous_strides(size)); - } else { - return at::as_strided_copy(base, size, c10::contiguous_strides(size)); - } - }, - [size = size.vec()](const at::Tensor & base, const at::Tensor & mutated_view, int64_t mutated_view_idx [[maybe_unused]]) -> at::Tensor { - return base.as_strided_scatter(mutated_view, size, c10::contiguous_strides(size)); - }, - /*has_symbolic_inputs=*/false - ); + auto view_meta = std::make_shared( + reapply_views, size.vec()); at::functionalization::impl::mutate_view_meta(self, view_meta); return self; } @@ -300,17 +317,11 @@ static at::Tensor _unsafe_view_functionalize(const at::Tensor & self, at::SymInt tmp_output = at::_unsafe_view_symint(self_, size); } - bool has_symbolic_inputs = std::any_of(size.begin(), size.end(), [=](auto& s) { return s.is_symbolic(); }); - - at::functionalization::ViewMeta view_meta = at::functionalization::ViewMeta( - [size = size.vec()](const at::Tensor & base, int64_t mutated_view_idx [[maybe_unused]]) -> at::Tensor { - return at::_unsafe_view_symint(base, size); - }, - [size = size.vec()](const at::Tensor & base, const at::Tensor & mutated_view, int64_t mutated_view_idx [[maybe_unused]]) -> at::Tensor { - return at::_unsafe_view_symint(mutated_view, base.sym_sizes()); - }, - /*has_symbolic_inputs=*/has_symbolic_inputs - ); + bool has_symbolic_inputs = std::any_of( + size.begin(), size.end(), [=](auto& s) { return s.is_symbolic(); }); + auto view_meta = + std::make_shared( + has_symbolic_inputs, size.vec()); auto out = at::functionalization::impl::create_functional_tensor_with_view_meta(tmp_output, self, std::move(view_meta)); // See Note [Propagating strides in the functionalization pass] diff --git a/aten/src/ATen/FunctionalizeFallbackKernel.h b/aten/src/ATen/FunctionalizeFallbackKernel.h new file mode 100644 index 000000000000..aabcfc827af3 --- /dev/null +++ b/aten/src/ATen/FunctionalizeFallbackKernel.h @@ -0,0 +1,58 @@ +#pragma once + +#include + +namespace at::functionalization { + +// `ViewMeta` implementation for `resize_` operation. +struct TORCH_API resize__ViewMeta : public ViewMeta { + FUNCTIONALIZATION_VIEWMETA_NAME(resize__ViewMeta) + FUNCTIONALIZATION_VIEWMETA_SERIALIZABLE_TUPLE( + bool /* reapply_views */, + const std::vector&); + + resize__ViewMeta(const SerializableTuple& tpl) + : resize__ViewMeta(std::get<0>(tpl), std::get<1>(tpl)) {} + + resize__ViewMeta(bool reapply_views, const std::vector& size) + : ViewMeta(/*has_symbolic_inputs=*/false), + reapply_views(reapply_views), + size(size) {} + + Tensor forward(const Tensor& base) override; + Tensor reverse(const Tensor& base, const Tensor& mutated_view) override; + + SerializableTuple to_serializable_tuple() { + return std::make_tuple(reapply_views, size); + } + + bool reapply_views; + std::vector size; +}; + +// `ViewMeta` implementation for `_unsafe_view` operation. 
+struct TORCH_API _unsafe_view_ViewMeta : public ViewMeta { + FUNCTIONALIZATION_VIEWMETA_NAME(_unsafe_view_ViewMeta) + FUNCTIONALIZATION_VIEWMETA_SERIALIZABLE_TUPLE( + bool /* has_symbolic_inputs */, + const std::vector&); + + _unsafe_view_ViewMeta(const SerializableTuple& tpl) + : _unsafe_view_ViewMeta(std::get<0>(tpl), std::get<1>(tpl)) {} + + _unsafe_view_ViewMeta( + bool has_symbolic_inputs, + const std::vector& size) + : ViewMeta(has_symbolic_inputs), size(size) {} + + Tensor forward(const Tensor& base) override; + Tensor reverse(const Tensor& base, const Tensor& mutated_view) override; + + SerializableTuple to_serializable_tuple() { + return std::make_tuple(has_symbolic_inputs, size); + } + + std::vector size; +}; + +} // namespace at::functionalization diff --git a/aten/src/ATen/ThreadLocalState.cpp b/aten/src/ATen/ThreadLocalState.cpp index 33977d8d7cf8..22509c7be4e1 100644 --- a/aten/src/ATen/ThreadLocalState.cpp +++ b/aten/src/ATen/ThreadLocalState.cpp @@ -8,6 +8,7 @@ #include #include #include +#include namespace at { @@ -19,6 +20,7 @@ ThreadLocalState::ThreadLocalState() torch_dispatch_mode_state_(c10::impl::TorchDispatchModeTLS::get_state()), python_dispatcher_state_(c10::impl::PythonDispatcherTLS::get_state()), python_torch_function_state_(at::impl::PythonTorchFunctionTLS::get_state()), saved_tensors_default_hooks_state_(at::SavedTensorDefaultHooks::get_tls_state()), functionalization_reapply_views_state_(at::functionalization::impl::getFunctionalizationReapplyViewsTLS()), + dtensor_allow_implicit_replication_(at::get_dtensor_allow_implicit_replication()), saved_objects_(at::impl::ThreadLocalPythonObjects::get_state()) { #if !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE) && !defined(BUILD_LITE_INTERPRETER) for(size_t i=0; i #include #include diff --git a/aten/src/ATen/core/boxing/KernelFunction_impl.h b/aten/src/ATen/core/boxing/KernelFunction_impl.h index be93d5991e9a..672309ec19a2 100644 --- a/aten/src/ATen/core/boxing/KernelFunction_impl.h +++ b/aten/src/ATen/core/boxing/KernelFunction_impl.h @@ -15,7 +15,7 @@ std::enable_if_t< std::is_base_of_v, std::unique_ptr> make_unique_base(Args&&... 
args) { - return std::unique_ptr(new Child(std::forward(args)...)); + return std::make_unique(std::forward(args)...); } } // namespace detail diff --git a/aten/src/ATen/core/dynamic_type.h b/aten/src/ATen/core/dynamic_type.h index b33e7ce0c549..2ba841e44e20 100644 --- a/aten/src/ATen/core/dynamic_type.h +++ b/aten/src/ATen/core/dynamic_type.h @@ -64,6 +64,7 @@ constexpr DynamicTypeBits kDynamicClassTypeBit = DYNAMIC_TYPE_BIT(10); _(ScalarType, kDynamicIntTypeBit, 1) \ _(Layout, kDynamicIntTypeBit, 1) \ _(SymInt, kDynamicIntTypeBit, 1) \ + _(SymBool, kDynamicIntTypeBit, 1) \ _(MemoryFormat, kDynamicIntTypeBit, 1) #define FORWARD_DECL_TYPE(NAME, _, __) struct NAME ## Type; diff --git a/aten/src/ATen/cuda/CUDABlas.cpp b/aten/src/ATen/cuda/CUDABlas.cpp index 4ab57f0beb1c..0d319ea59384 100644 --- a/aten/src/ATen/cuda/CUDABlas.cpp +++ b/aten/src/ATen/cuda/CUDABlas.cpp @@ -996,9 +996,6 @@ void bgemm(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)) { template <> void bgemm(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(at::Half, float)) { - #ifdef USE_ROCM - TORCH_CHECK(false, "bgemm input type at::Half and output type float is not supported for ROCm"); - #endif // TODO: Support tuning for Half inputs and FP32 output bgemm_internal(CUDABLAS_BGEMM_ARGS(at::Half)); } @@ -1006,9 +1003,7 @@ void bgemm(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(at::Half, float) template <> void bgemm(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(at::BFloat16, float)) { - #ifdef USE_ROCM - TORCH_CHECK(false, "bgemm input type at::BFloat16 and output type float is not supported for ROCm"); - #else + #ifndef USE_ROCM cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties(); if (prop->major < 8) @@ -1513,9 +1508,6 @@ void gemm(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) { template <> void gemm(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(at::Half, float)) { - #ifdef USE_ROCM - TORCH_CHECK(false, "gemm input type at::Half and output type float is not supported for ROCm"); - #endif // TODO: Support Tuning for fp16-fp32 gemm gemm_internal(CUDABLAS_GEMM_ARGS(at::Half)); } @@ -1523,9 +1515,7 @@ void gemm(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(at::Half, float)) template <> void gemm(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(at::BFloat16, float)) { - #ifdef USE_ROCM - TORCH_CHECK(false, "gemm input type at::BFloat16 and output type float is not supported for ROCm"); - #else + #ifndef USE_ROCM cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties(); if (prop->major < 8) @@ -1947,11 +1937,11 @@ void scaled_gemm( computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_TRANSB, _cublasOpFromChar(transb)); cublasLtMatmulDescAttributes_t matmulDescA = CUBLASLT_MATMUL_DESC_A_SCALE_POINTER; cublasLtMatmulDescAttributes_t matmulDescB = CUBLASLT_MATMUL_DESC_B_SCALE_POINTER; +#if defined(USE_ROCM) && !defined(HIPBLASLT_OUTER_VEC) && defined(HIPBLASLT_VEC_EXT) // hipblaslt supported row-wise before cublas, and did so their own way (via // the SCALE_POINTERSs), but then migrated to match how cublas does it (via // the SCALE_MODEs). Here we check for this early custom mode. 
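For context on what the row-wise mode being selected here computes, independent of which hipblaslt or cublasLt attribute carries the scales: the first operand has one dequantization scale per row and the second has one per column, and the epilogue applies them as an outer product on the fp32 accumulator. A plain CPU reference sketch of those numerics; the function name and row-major layouts are illustrative assumptions, not the blasLt call sequence.

#include <cstdint>
#include <vector>

void scaled_gemm_rowwise_ref(
    const std::vector<float>& A,        // M x K, fp8 values upcast to float, not yet scaled
    const std::vector<float>& B,        // K x N, same
    const std::vector<float>& a_scale,  // one scale per row of A (size M)
    const std::vector<float>& b_scale,  // one scale per column of B (size N)
    std::vector<float>& C,              // M x N output
    int64_t M, int64_t K, int64_t N) {
  for (int64_t m = 0; m < M; ++m) {
    for (int64_t n = 0; n < N; ++n) {
      float acc = 0.f;
      for (int64_t k = 0; k < K; ++k) {
        acc += A[m * K + k] * B[k * N + n];
      }
      // The two scale vectors enter as an outer product on the output.
      C[m * N + n] = acc * a_scale[m] * b_scale[n];
    }
  }
}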
bool use_rowwise = (mat1_scaling_type == ScalingType::RowWise && mat2_scaling_type == ScalingType::RowWise); -#if defined(USE_ROCM) && !defined(HIPBLASLT_OUTER_VEC) && defined(HIPBLASLT_VEC_EXT) if (use_rowwise) { matmulDescA = HIPBLASLT_MATMUL_DESC_A_SCALE_POINTER_VEC_EXT; matmulDescB = HIPBLASLT_MATMUL_DESC_B_SCALE_POINTER_VEC_EXT; @@ -1966,8 +1956,12 @@ void scaled_gemm( } #endif } -#else - // rowwise isn't supported using cublaslt or older hipblaslt +#elif (CUDA_VERSION < 12090) && !defined(USE_ROCM) + // hipblaslt supported row-wise before cublas, and did so their own way (via + // the SCALE_POINTERSs), but then migrated to match how cublas does it (via + // the SCALE_MODEs). Here we check for this early custom mode. + bool use_rowwise = (mat1_scaling_type == ScalingType::RowWise && mat2_scaling_type == ScalingType::RowWise); + // rowwise isn't supported using older cublaslt or older hipblaslt TORCH_INTERNAL_ASSERT(use_rowwise == false, "rowwise scaled_gemm not supported with blaslt"); #endif // if defined(USE_ROCM) && !defined(HIPBLASLT_OUTER_VEC) && defined(HIPBLASLT_VEC_EXT) computeDesc.setAttribute(matmulDescA, mat1_scale_ptr); @@ -2583,8 +2577,6 @@ void vdot>(CUDABLAS_DOT_ARGTYPES(c10::complex)) { reinterpret_cast(result))); } -// HIP on Windows does not support -#if !(defined(USE_ROCM) && defined(_MSC_VER)) template <> void getrsBatched(CUDABLAS_GETRS_ARGTYPES(float)) { TORCH_CUDABLAS_CHECK(cublasSgetrsBatched( @@ -2783,6 +2775,5 @@ void gelsBatched>(CUDABLAS_GELS_BATCHED_ARGTYPES(c10::comple devInfoArray, batchSize)); } -#endif // !(defined(USE_ROCM) && defined(_MSC_VER)) } // namespace at::cuda::blas diff --git a/aten/src/ATen/cuda/CUDABlas.h b/aten/src/ATen/cuda/CUDABlas.h index 5021917fe095..b235840418e2 100644 --- a/aten/src/ATen/cuda/CUDABlas.h +++ b/aten/src/ATen/cuda/CUDABlas.h @@ -343,9 +343,6 @@ void vdot>(CUDABLAS_DOT_ARGTYPES(c10::complex)); int m, int n, int nrhs, Dtype** dA_array, int ldda, \ Dtype** dC_array, int lddc, int* info, int *devInfoArray, int batchSize -// HIP on Windows does not support getrs, geqrf, getrf, gels -#if !(defined(USE_ROCM) && defined(_MSC_VER)) - template void getrsBatched(CUDABLAS_GETRS_ARGTYPES(Dtype)) { static_assert(false&&sizeof(Dtype),"at::cuda::blas::getrsBatched: not implemented"); @@ -400,28 +397,4 @@ TORCH_CUDA_CU_API void gelsBatched>(CUDABLAS_GELS_BATCHED_A template<> TORCH_CUDA_CU_API void gelsBatched>(CUDABLAS_GELS_BATCHED_ARGTYPES(c10::complex)); -#else // !(defined(USE_ROCM) && defined(_MSC_VER)) - -template -void getrsBatched(CUDABLAS_GETRS_ARGTYPES(Dtype)) { - TORCH_CHECK(false, "at::cuda::blas::getrsBatched: not supported for HIP on Windows"); -} - -template -void geqrfBatched(CUDABLAS_GEQRF_BATCHED_ARGTYPES(Dtype)) { - TORCH_CHECK(false, "at::cuda::blas::geqrfBatched: not supported for HIP on Windows"); -} - -template -void getrfBatched(CUDABLAS_GETRF_ARGTYPES(Dtype)) { - TORCH_CHECK(false, "at::cuda::blas::getrfBatched: not supported for HIP on Windows"); -} - -template -void gelsBatched(CUDABLAS_GELS_BATCHED_ARGTYPES(Dtype)) { - TORCH_CHECK(false, "at::cuda::blas::gelsBatched: not supported for HIP on Windows"); -} - -#endif // !(defined(USE_ROCM) && defined(_MSC_VER)) - } // namespace at::cuda::blas diff --git a/aten/src/ATen/cuda/detail/OffsetCalculator.cuh b/aten/src/ATen/cuda/detail/OffsetCalculator.cuh index 60e1a19c1aac..a65db3f2df12 100644 --- a/aten/src/ATen/cuda/detail/OffsetCalculator.cuh +++ b/aten/src/ATen/cuda/detail/OffsetCalculator.cuh @@ -45,6 +45,24 @@ struct OffsetCalculator { C10_HOST_DEVICE 
offset_type get(index_t linear_idx) const { offset_type offsets; + +#if defined(USE_ROCM) + if ((dims > 0) && (dims <= 2)) { + auto divmod = sizes_[0].divmod(linear_idx); + #pragma unroll + for (int arg = 0; arg < NARGS; arg++) + offsets[arg] = divmod.mod * strides_[0][arg]; + if (dims >= 2) { + divmod = sizes_[1].divmod(divmod.div); + #pragma unroll + for (int arg = 0; arg < NARGS; arg++) + offsets[arg] += divmod.mod * strides_[1][arg]; + } + // [...] + return offsets; + } +#endif + #pragma unroll for (int arg = 0; arg < NARGS; arg++) { offsets[arg] = 0; diff --git a/aten/src/ATen/cuda/nvrtc_stub/ATenNVRTC.h b/aten/src/ATen/cuda/nvrtc_stub/ATenNVRTC.h index d89875865b88..aca83386ad42 100644 --- a/aten/src/ATen/cuda/nvrtc_stub/ATenNVRTC.h +++ b/aten/src/ATen/cuda/nvrtc_stub/ATenNVRTC.h @@ -117,6 +117,8 @@ namespace at::cuda { _(nvrtcGetPTXSize) \ _(nvrtcGetPTX) \ _(cuModuleLoadData) \ + _(cuModuleLoad) \ + _(cuGetErrorString) \ _(cuModuleGetFunction) \ _(HIPOCCUPANCYMAXACTIVEBLOCKSPERMULTIPROCESSOR) \ _(nvrtcGetErrorString) \ diff --git a/aten/src/ATen/cuda/tunable/Tunable.cpp b/aten/src/ATen/cuda/tunable/Tunable.cpp index 9972cbd1c151..3511e48ae061 100644 --- a/aten/src/ATen/cuda/tunable/Tunable.cpp +++ b/aten/src/ATen/cuda/tunable/Tunable.cpp @@ -220,19 +220,17 @@ TuningResultsValidator::TuningResultsValidator() { []() { return GetPyTorchVersion(); }, [this](auto&& k) { return ValidatePyTorchVersion(std::forward(k)); }); #ifdef USE_ROCM - // rocm + // hip { -#ifdef _WIN32 - std::string rocm_version = HIP_VERSION_BUILD_NAME; -#else - std::string rocm_version = ROCM_BUILD_INFO; -#endif + // HIP version is more accurate than ROCm version. User's environment could be a stock + // ROCm install but with a mix of newer components, making ROCm version meaningless. + std::string hip_version = c10::str(TORCH_HIP_VERSION); RegisterValidator( - "ROCM_VERSION", - [rocm_version]() { return rocm_version; }, - [rocm_version](auto&& k) { - TUNABLE_LOG1("ROCM_VERSION validation: expect ", k, " to match ", rocm_version); - return rocm_version == k ? OK : FAIL; + "HIP_VERSION", + [hip_version]() { return hip_version; }, + [hip_version](auto&& k) { + TUNABLE_LOG1("HIP_VERSION validation: expect ", k, " to match ", hip_version); + return hip_version == k ? OK : FAIL; }); } // gfx arch diff --git a/aten/src/ATen/cudnn/Descriptors.h b/aten/src/ATen/cudnn/Descriptors.h index 6c2492b12e6b..85f0286542e7 100644 --- a/aten/src/ATen/cudnn/Descriptors.h +++ b/aten/src/ATen/cudnn/Descriptors.h @@ -38,6 +38,7 @@ inline int dataSize(cudnnDataType_t dataType) } } +// NOTE [ cudnn fixSizeOneDimStride ] // The stride for a size-1 dimensions is not uniquely determined; in // fact, it can be anything you want, because the fact that the // tensor is size 1 at this dimension means that you will never actually diff --git a/aten/src/ATen/functorch/BatchRulesModules.cpp b/aten/src/ATen/functorch/BatchRulesModules.cpp index de69e5c1e23a..6e63708a90f4 100644 --- a/aten/src/ATen/functorch/BatchRulesModules.cpp +++ b/aten/src/ATen/functorch/BatchRulesModules.cpp @@ -7,6 +7,7 @@ #include #include #include +#include #include @@ -44,8 +45,13 @@ static std::tuple> embedding_batch_rule( const auto weight_ = reshape_dim_into(*weight_bdim, 0, weight); auto indices_ = moveBatchDimToFront(indices, indices_bdim); - const auto range = getStepTensor(indices, batch_size, num_embeddings); - indices_ = indices_ + range; + { + // getStepTensor returns a regular Tensor. 
If indices_ is a DTensor + // we want to allow this mixed DTensor-Tensor operation. + at::DTensorAllowImplicitReplication guard; + const auto range = getStepTensor(indices, batch_size, num_embeddings); + indices_ = indices_ + range; + } auto result = at::embedding_symint(weight_, indices_, std::move(padding_idx), scale_grad_by_freq, sparse); return std::make_tuple(std::move(result), 0); } diff --git a/aten/src/ATen/functorch/BatchRulesUnaryOps.cpp b/aten/src/ATen/functorch/BatchRulesUnaryOps.cpp index b26d2c4a419e..48a735c3e533 100644 --- a/aten/src/ATen/functorch/BatchRulesUnaryOps.cpp +++ b/aten/src/ATen/functorch/BatchRulesUnaryOps.cpp @@ -171,6 +171,8 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchBatched, m) { POINTWISE_BOXED(fill_.Scalar); POINTWISE_BOXED(zero_); + // This is special because this op doesn't return anything + m.impl("_assert_tensor_metadata", native::_assert_tensor_metadata); #undef UNARY_POINTWISE #undef UNARY_POINTWISE_ALL diff --git a/aten/src/ATen/miopen/Descriptors.cpp b/aten/src/ATen/miopen/Descriptors.cpp index 08c09b88f99c..86e42ee3b66d 100644 --- a/aten/src/ATen/miopen/Descriptors.cpp +++ b/aten/src/ATen/miopen/Descriptors.cpp @@ -19,31 +19,37 @@ inline miopenDataType_t getDataType(const at::Tensor& t) { } else { TORCH_CHECK( false, - "TensorDescriptor only supports float, half and bfloat16 tensors"); + "TensorDescriptor does not support ", scalar_type); } } } // anonymous namespace +constexpr size_t MIOPEN_DIM_MAX = 5; -void TensorDescriptor::set(const at::Tensor &t, size_t pad) { - set(getDataType(t), t.sizes(), t.strides(), pad); +void TensorDescriptor::set(const at::Tensor &t, at::MemoryFormat memory_format, size_t pad) { + set(getDataType(t), t.sizes(), t.strides(), pad, + memory_format == at::MemoryFormat::ChannelsLast || + memory_format == at::MemoryFormat::ChannelsLast3d); } -constexpr size_t MIOPEN_DIM_MAX = 5; +void TensorDescriptor::set(const at::Tensor &t, size_t pad) { + auto memory_format = t.suggest_memory_format(); + set(getDataType(t), t.sizes(), t.strides(), pad, + memory_format == at::MemoryFormat::ChannelsLast || + memory_format == at::MemoryFormat::ChannelsLast3d); +} void TensorDescriptor::set(miopenDataType_t datatype, IntArrayRef t_sizes, IntArrayRef t_strides, size_t pad) { + set(datatype, t_sizes, t_strides, pad, + is_channels_last_strides_2d(t_sizes, t_strides) || + is_channels_last_strides_3d(t_sizes, t_strides)); +} + +void TensorDescriptor::set(miopenDataType_t datatype, IntArrayRef t_sizes, IntArrayRef t_strides, size_t pad, bool nhwc) { size_t dim = t_sizes.size(); if (dim > MIOPEN_DIM_MAX || pad > MIOPEN_DIM_MAX) -#define _STR(X) #X -#define STR(X) _STR(X) - TORCH_CHECK( - false, - "MIOpen supports only up to ", - STR(MIOPEN_DIM_MAX), - " dimensions"); -#undef _STR -#undef STR + TORCH_CHECK(false, "MIOpen supports only up to ", MIOPEN_DIM_MAX, " dimensions"); int size[MIOPEN_DIM_MAX]; int stride[MIOPEN_DIM_MAX]; for (const auto i : c10::irange(dim)) { @@ -54,7 +60,7 @@ void TensorDescriptor::set(miopenDataType_t datatype, IntArrayRef t_sizes, IntAr size[i] = 1; stride[i] = 1; } - set(datatype, static_cast(std::max(dim, pad)), size, stride); + set(datatype, static_cast(std::max(dim, pad)), size, stride, nhwc); } std::string miopenTypeToString(miopenDataType_t dtype) { @@ -74,10 +80,11 @@ std::string miopenTypeToString(miopenDataType_t dtype) { std::ostream& operator<<(std::ostream & out, const TensorDescriptor& d) { out << "TensorDescriptor " << static_cast(d.desc()) << "\n"; - int nbDims = 4; + int nbDims = 0; int dimA[MIOPEN_DIM_MAX]; 
int strideA[MIOPEN_DIM_MAX]; miopenDataType_t dtype; + miopenGetTensorDescriptorSize(d.desc(), &nbDims); miopenGetTensorDescriptor(d.desc(), &dtype, dimA, strideA); out << " type = " << miopenTypeToString(dtype) << "\n"; out << " nbDims = " << nbDims << "\n"; @@ -99,19 +106,17 @@ void TensorDescriptor::print() { std::cout << *this; } void FilterDescriptor::set(const at::Tensor &t, const at::MemoryFormat memory_format, int64_t pad) { auto dim = t.ndimension(); - if (dim > static_cast(MIOPEN_DIM_MAX) || pad > static_cast(MIOPEN_DIM_MAX)) { -#define _STR(X) #X -#define STR(X) _STR(X) - TORCH_CHECK( - false, - "MIOpen supports only up to ", - STR(MIOPEN_DIM_MAX), - " dimensions"); -#undef _STR -#undef STR - } + if (dim > MIOPEN_DIM_MAX || pad > MIOPEN_DIM_MAX) + TORCH_CHECK(false, "MIOpen supports only up to ", MIOPEN_DIM_MAX, " dimensions"); + // NB: It is possible for this test to be insufficient, because the + // Tensor passed in to set the filter descriptor may not be the actual + // Tensor whose data pointer is passed to cuDNN. Nevertheless, + // that is the common case, so we can catch most client errors with this test. TORCH_CHECK(t.is_contiguous(memory_format), - "MIOpen filters (a.k.a. weights) must be contiguous"); + "MIOpen filters (a.k.a. weights) must be contiguous in desired memory_format\n", + "Weight sizes: ", t.sizes(), "\n", + "Weight strides: ", t.strides(), "\n", + "cuDNN suggested memory_format: ", memory_format); int size[MIOPEN_DIM_MAX]; int stride[MIOPEN_DIM_MAX]; @@ -131,7 +136,9 @@ void FilterDescriptor::set(const at::Tensor &t, const at::MemoryFormat memory_fo } dim = std::max(dim, pad); - set(getDataType(t), (int) dim, size, stride); + set(getDataType(t), static_cast(dim), size, stride, + memory_format == at::MemoryFormat::ChannelsLast || + memory_format == at::MemoryFormat::ChannelsLast3d); } }} diff --git a/aten/src/ATen/miopen/Descriptors.h b/aten/src/ATen/miopen/Descriptors.h index 2eee837cd533..8825575c9231 100644 --- a/aten/src/ATen/miopen/Descriptors.h +++ b/aten/src/ATen/miopen/Descriptors.h @@ -9,6 +9,8 @@ namespace at { namespace native { +std::string miopenTypeToString(miopenDataType_t dtype); + inline int dataSize(miopenDataType_t dataType) { switch (dataType) { @@ -19,6 +21,32 @@ inline int dataSize(miopenDataType_t dataType) } } +// See NOTE [ cudnn fixSizeOneDimStride ] in aten/src/ATen/cudnn/Descriptors.h +template +static inline void fixSizeOneDimStride(int dim, const T *size, T *stride, bool nhwc) { + int64_t z = 1; + int index = 0; + std::vector permutation(dim); + + if (nhwc) { + permutation[index++] = 1; + } + for (int d = dim-1; d > 1; d--) { + permutation[index++] = d; + } + if (!nhwc) { + permutation[index++] = 1; + } + permutation[index++] = 0; + for (int d : permutation) { + if (size[d] == 1) { + stride[d] = z; + } else { + z *= size[d]; + } + } +} + template struct DescriptorDeleter { void operator()(T* x) { @@ -75,14 +103,20 @@ class TORCH_HIP_CPP_API TensorDescriptor : public Descriptor< set(t, pad); } + // See Note [CuDNN broadcast padding] void set(const at::Tensor &t, size_t pad = 0); + void set(const at::Tensor &t, at::MemoryFormat memory_format, size_t pad = 0); void set(miopenDataType_t dataType, IntArrayRef sizes, IntArrayRef strides, size_t pad = 0); void print(); private: - void set(miopenDataType_t dataType, int dim, int* size, int* stride) { - MIOPEN_CHECK(miopenSetTensorDescriptor(mut_desc(), dataType, dim, size, stride)); + void set(miopenDataType_t dataType, IntArrayRef sizes, IntArrayRef strides, size_t pad, bool nhwc); + 
+ void set(miopenDataType_t dataType, int dim, int* size, int* stride, bool nhwc) { + std::vector strides_copy(stride, stride + dim); + fixSizeOneDimStride(dim, size, strides_copy.data(), nhwc); + MIOPEN_CHECK(miopenSetTensorDescriptor(mut_desc(), dataType, dim, size, strides_copy.data())); } }; @@ -100,8 +134,10 @@ class TORCH_HIP_CPP_API FilterDescriptor : public Descriptor< void set(const at::Tensor &t, const at::MemoryFormat memory_format, int64_t pad = 0); private: - void set(miopenDataType_t dataType, int dim, int* size, int* stride) { - MIOPEN_CHECK(miopenSetTensorDescriptor(mut_desc(), dataType, dim, size, stride)); + void set(miopenDataType_t dataType, int dim, int* size, int* stride, bool nhwc) { + std::vector strides_copy(stride, stride + dim); + fixSizeOneDimStride(dim, size, strides_copy.data(), nhwc); + MIOPEN_CHECK(miopenSetTensorDescriptor(mut_desc(), dataType, dim, size, strides_copy.data())); } }; @@ -166,4 +202,4 @@ union Constant } }; -}} // namespace +}} // namespace diff --git a/aten/src/ATen/mps/EmptyTensor.cpp b/aten/src/ATen/mps/EmptyTensor.cpp index d858df073397..6c58de099648 100644 --- a/aten/src/ATen/mps/EmptyTensor.cpp +++ b/aten/src/ATen/mps/EmptyTensor.cpp @@ -12,7 +12,7 @@ #define MPS_ERROR_NOT_COMPILED "PyTorch code is not compiled with MPS enabled" #define MPS_ERROR_RUNTIME_TOO_LOW \ - "The MPS backend is supported on MacOS 13.0+.", \ + "The MPS backend is supported on MacOS 14.0+. ", \ "Current OS version can be queried using `sw_vers`" #define MPS_ERROR_DOUBLE_NOT_SUPPORTED "Cannot convert a MPS Tensor to float64 dtype " \ "as the MPS framework doesn't support float64. Please use float32 instead." diff --git a/aten/src/ATen/mps/MPSHooks.mm b/aten/src/ATen/mps/MPSHooks.mm index a2ec221c1bfe..34fbd31af91d 100644 --- a/aten/src/ATen/mps/MPSHooks.mm +++ b/aten/src/ATen/mps/MPSHooks.mm @@ -70,7 +70,10 @@ } void* MPSHooks::getCommandBuffer() const { - return at::mps::getDefaultMPSStream()->commandBuffer(); + auto stream = at::mps::getDefaultMPSStream(); + // Release pending computeCommandEncoder, as extensions are likely to allocate a new one + stream->endKernelCoalescing(); + return stream->commandBuffer(); } void* MPSHooks::getDispatchQueue() const { diff --git a/aten/src/ATen/mps/MPSStream.mm b/aten/src/ATen/mps/MPSStream.mm index e9627a343ad6..71325bd69e1d 100644 --- a/aten/src/ATen/mps/MPSStream.mm +++ b/aten/src/ATen/mps/MPSStream.mm @@ -158,7 +158,18 @@ @interface MPSGraphExecutionDescriptor () endKernelCoalescing(); id blitEncoder = [commandBuffer() blitCommandEncoder]; - [blitEncoder fillBuffer:buffer range:NSMakeRange(offset, length) value:value]; + // For some reason fillBuffer stopped working for length > 4Gb on MacOS 26 + // See https://github.com/pytorch/pytorch/issues/163962 + // Workaround by batching copy commands into 4Gb chunks + constexpr size_t max_copy_size = 0x100000000; // 4GB + size_t bytes_filled = 0; + size_t bytes_remains = length; + while (bytes_remains > 0) { + NSUInteger bytes_to_copy = std::min(max_copy_size, bytes_remains); + [blitEncoder fillBuffer:buffer range:NSMakeRange(offset + bytes_filled, bytes_to_copy) value:value]; + bytes_filled += bytes_to_copy; + bytes_remains -= bytes_to_copy; + } [blitEncoder endEncoding]; synchronize(syncType); } diff --git a/aten/src/ATen/native/Blas.cpp b/aten/src/ATen/native/Blas.cpp index 674ccf11cfb9..49366151ae60 100644 --- a/aten/src/ATen/native/Blas.cpp +++ b/aten/src/ATen/native/Blas.cpp @@ -9,6 +9,7 @@ #include #include #include +#include #if !defined(__s390x__) && !defined(__powerpc__)
#include #endif @@ -332,4 +333,23 @@ _scaled_mm_cpu(const Tensor& mat_a, const Tensor& mat_b, return _scaled_mm_out_cpu(mat_a, mat_b, scale_a, scale_b, bias, scale_result, out_dtype, use_fast_accum, out); } +// TODO(vasiliy, future PR): figure out why we need to declare this function, when +// other functions that live in ATen/native/*.cpp without declarations +// or headers work just fine. +Tensor _grouped_mm(const Tensor& mat_a, const Tensor& mat_b, +const std::optional& offs, +const std::optional& bias, +std::optional out_dtype); + +Tensor _grouped_mm(const Tensor& mat_a, const Tensor& mat_b, +const std::optional& offs, +const std::optional& bias, +std::optional out_dtype) { + _grouped_mm_validate_inputs(mat_a, mat_b, offs, bias, out_dtype); + const auto out_dtype_ = _resolve_grouped_mm_out_dtype(mat_a, mat_b, out_dtype); + Tensor out = create_grouped_gemm_output_tensor(mat_a, mat_b, offs, out_dtype_); + _grouped_mm_fallback(mat_a, mat_b, offs, bias, out_dtype, out); + return out; +} + } // namespace at::native diff --git a/aten/src/ATen/native/CPUBlas.cpp b/aten/src/ATen/native/CPUBlas.cpp index b16c1ef04fa0..e06afddd05aa 100644 --- a/aten/src/ATen/native/CPUBlas.cpp +++ b/aten/src/ATen/native/CPUBlas.cpp @@ -496,18 +496,18 @@ void gemm( // for the fallback path, first compute gemm with beta = 0, // and then add c in full precision. int64_t c_size = n * m; - std::vector float16_c(c_size, 0.f); - gemm_stub( + std::vector float_c(c_size, 0.f); + gemm_no_downcast_stub( at::kCPU, at::kHalf, - transa, transb, m, n, k, alpha, a, lda, b, ldb, 0.f, float16_c.data(), m); + transa, transb, m, n, k, alpha, a, lda, b, ldb, 0.f, float_c.data(), m); for (const auto j : c10::irange(n)) { for (const auto i : c10::irange(m)) { auto offset = j * ldc + i; // beta == 0 won't propagate NaN from C if (beta == 0.f) { - c[offset] = c10::convert(float16_c[j * m + i]); + c[offset] = float_c[j * m + i]; } else { - c[offset] = beta * c[offset] + c10::convert(float16_c[j * m + i]); + c[offset] = beta * c[offset] + float_c[j * m + i]; } } } diff --git a/aten/src/ATen/native/ConvUtils.h b/aten/src/ATen/native/ConvUtils.h index 84381efe55b0..e160c84ced33 100644 --- a/aten/src/ATen/native/ConvUtils.h +++ b/aten/src/ATen/native/ConvUtils.h @@ -353,19 +353,21 @@ TORCH_API void _cudnn_set_conv_benchmark_empty_cache(bool enable); TORCH_API bool _cudnn_get_conv_benchmark_empty_cache(); -inline bool miopen_conv_use_channels_last(const at::Tensor& input, const at::Tensor& weight) { - +inline at::MemoryFormat miopen_conv_suggest_memory_format(const at::Tensor& input, const at::Tensor& weight) { // disable NHWC for float64 input. if (!at::detail::getCUDAHooks().compiledWithMIOpen() || input.scalar_type() == at::kDouble || weight.scalar_type() == at::kDouble) { - return false; + return at::MemoryFormat::Contiguous; } // TODO: Remove PYTORCH_MIOPEN_SUGGEST_NHWC once ROCm officially supports NHWC in MIOpen - // See #64427 - static std::optional PYTORCH_MIOPEN_SUGGEST_NHWC = c10::utils::check_env("PYTORCH_MIOPEN_SUGGEST_NHWC"); - static bool suggest_nhwc = PYTORCH_MIOPEN_SUGGEST_NHWC && *PYTORCH_MIOPEN_SUGGEST_NHWC; + // See https://github.com/pytorch/pytorch/issues/64427. + // non static variable is used to be able to change environment variable in runtime for testing + // enabled by default for ROCm >= 7.0.0 with miopen 3.5 + int miopen_version = detail::getCUDAHooks().compiledWithMIOpen() ? 
detail::getCUDAHooks().versionMIOpen() : 0; + bool is_miopen_3_5 = miopen_version >= 30500; // ROCm 7.0 + bool suggest_nhwc = c10::utils::check_env("PYTORCH_MIOPEN_SUGGEST_NHWC").value_or(is_miopen_3_5); auto input_memory_format = input.suggest_memory_format(); auto weight_memory_format = weight.suggest_memory_format(); @@ -375,13 +377,24 @@ inline bool miopen_conv_use_channels_last(const at::Tensor& input, const at::Ten (input_memory_format == at::MemoryFormat::ChannelsLast) || (weight_memory_format == at::MemoryFormat::ChannelsLast) ); + if (can_use_miopen_channels_last_2d) { + return at::MemoryFormat::ChannelsLast; + } bool can_use_miopen_channels_last_3d = suggest_nhwc && (weight_ndim == 5) && ( (input_memory_format == at::MemoryFormat::ChannelsLast3d) || (weight_memory_format == at::MemoryFormat::ChannelsLast3d) ); + if (can_use_miopen_channels_last_3d) { + return at::MemoryFormat::ChannelsLast3d; + } + + return at::MemoryFormat::Contiguous; +} - return can_use_miopen_channels_last_2d || can_use_miopen_channels_last_3d; +// deprecated, but to remove would be BC-breaking +inline bool miopen_conv_use_channels_last(const at::Tensor& input, const at::Tensor& weight) { + return miopen_conv_suggest_memory_format(input, weight) != at::MemoryFormat::Contiguous; } inline bool mkldnn_conv_use_channels_last(const at::Tensor& input, const at::Tensor& weight) { diff --git a/aten/src/ATen/native/Convolution.cpp b/aten/src/ATen/native/Convolution.cpp index d2b7b055684e..2e0e4a47f37b 100644 --- a/aten/src/ATen/native/Convolution.cpp +++ b/aten/src/ATen/native/Convolution.cpp @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -300,67 +301,50 @@ struct ConvParams { bool allow_tf32{}; bool is_strided() const { - bool is_strided = false; - for (const auto& s : stride) { - is_strided |= (s != 1); - } - return is_strided; + return std::any_of( + stride.cbegin(), stride.cend(), [](const T& s) { return s != 1; }); } bool is_dilated() const { - bool is_dilated = false; - for (const auto& d : dilation) { - is_dilated |= (d != 1); - } - return is_dilated; + return std::any_of( + dilation.cbegin(), dilation.cend(), [](const T& d) { return d != 1; }); } bool is_padded() const { - bool is_padded = false; - for (auto p : padding) { - is_padded |= (p != 0); - } - return is_padded; + return std::any_of( + padding.cbegin(), padding.cend(), [](const T& p) { return p != 0; }); } bool is_output_padding_neg() const { - bool is_non_neg = false; - for (const auto& p : output_padding) { - is_non_neg |= (p < 0); - } - return is_non_neg; + return std::any_of( + output_padding.cbegin(), + output_padding.cend(), + [](const T& p) { return p < 0; }); } bool is_output_padding_big() const { - bool is_big = false; + // Revisit this with std::views::zip at C++20. 
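The predicate rewrites in this hunk come in two shapes: single-container checks become std::any_of, while the output_padding check just below keeps an index loop because it compares two containers position by position (the std::views::zip note refers to that case). A small self-contained sketch of both shapes, assuming int64_t elements:

#include <algorithm>
#include <cstdint>
#include <vector>

// Any-element predicate, the std::any_of form used for is_strided()/is_padded()/etc.
bool any_nonzero(const std::vector<int64_t>& v) {
  return std::any_of(v.cbegin(), v.cend(), [](int64_t x) { return x != 0; });
}

// Pairwise predicate, the is_output_padding_big() form: it needs both containers
// at the same index, so an index loop is kept until views::zip is available.
bool any_pad_ge_stride(const std::vector<int64_t>& output_padding,
                       const std::vector<int64_t>& stride) {
  for (size_t i = 0; i < output_padding.size(); ++i) {
    if (output_padding[i] >= stride[i]) {
      return true;
    }
  }
  return false;
}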
for (auto i: c10::irange(output_padding.size())) { - is_big |= (output_padding[i] >= stride[i]); + if (output_padding[i] >= stride[i]) { + return true; + } } - return is_big; + return false; } bool is_padding_neg() const { - bool is_non_neg = false; - for (const auto& p : padding) { - is_non_neg |= (p < 0); - } - return is_non_neg; + return std::any_of( + padding.cbegin(), padding.cend(), [](const T& p) { return p < 0; }); } bool is_dilation_neg() const { - bool is_non_neg = false; - for (const auto& p : dilation) { - is_non_neg |= (p < 0); - } - return is_non_neg; + return std::any_of( + dilation.cbegin(), dilation.cend(), [](const T& d) { return d < 0; }); } bool is_stride_nonpos() const { - bool is_nonpos = false; - for (const auto& s : stride) { - is_nonpos |= (s <= 0); - } - return is_nonpos; + return std::any_of( + stride.cbegin(), stride.cend(), [](const T& s) { return s <= 0; }); } void view1d_as_2d() { @@ -426,11 +410,23 @@ struct ConvParams { // cudnn and miopen are guaranteed not to be on mobile, and T102591915 / T110194934 suggest // that maybe the compiledWithCuDNN() check sometimes segfaults (though I can't imagine how) #if !defined(C10_MOBILE) - if (!detail::getCUDAHooks().compiledWithCuDNN()) { + if (!detail::getCUDAHooks().compiledWithCuDNN() || !input.is_cuda() || !cudnn_enabled) { return false; } + static long cudnn_version = detail::getCUDAHooks().versionCuDNN(); + // broken on cuDNN 9.8 + if (cudnn_version >= 90800) { + if (cudnn_conv_suggest_memory_format(input, weight) == at::MemoryFormat::Contiguous && + (input.scalar_type() == at::kBFloat16 || input.scalar_type() == at::kHalf) && + weight.dim() == 5) { + for (int i = 2; i < weight.dim(); i++) { + if (weight.size(i) != 1) { + return false; + } + } + } + } if (needs_64bit_indexing_no_split(input, weight)) { - static long cudnn_version = detail::getCUDAHooks().versionCuDNN(); if (!(cudnn_version >= 90300 && at::native::cudnnv8_enabled_check_debug())) { TORCH_WARN_ONCE("cuDNN cannot be used for large non-batch-splittable convolutions" " if the V8 API is not enabled or before cuDNN version 9.3+." @@ -438,9 +434,6 @@ struct ConvParams { return false; } } - if (!input.is_cuda() || !cudnn_enabled) { - return false; - } if (input.scalar_type() == at::kBFloat16 || weight.scalar_type() == at::kBFloat16) { if (!(detail::getCUDAHooks().supportsBFloat16ConvolutionWithCuDNNv8() && at::native::cudnnv8_enabled_check_debug())) { return false; @@ -459,13 +452,19 @@ struct ConvParams { // Use cudnn for FP16 depthwise convolutions bool use_cudnn_depthwise(const at::Tensor& input, const at::Tensor& weight) const { - if (cudnn_conv_suggest_memory_format(input, weight) != at::MemoryFormat::Contiguous && use_cudnn(input, weight)) { - // always use cudnn_depthwise for channels_last format - return true; + if (!cudnn_enabled || !detail::getCUDAHooks().compiledWithCuDNN() || !input.is_cuda()) { + return false; } // native kernel doesn't support 64-bit non-splittable case - if (cudnn_enabled && !(canUse32BitIndexMath(input) && canUse32BitIndexMath(weight))) { + if (!(canUse32BitIndexMath(input) && canUse32BitIndexMath(weight))) { static long cudnn_version = detail::getCUDAHooks().compiledWithCuDNN() ? 
detail::getCUDAHooks().versionCuDNN() : -1; + // TODO(eqy): remove this once cuDNN fixes 64-bit depthwise support, first broken in 9.11x + if (cudnn_conv_suggest_memory_format(input, weight) != at::MemoryFormat::Contiguous) { + if (cudnn_version < 0 || cudnn_version > 91000) { + return false; + } + } + if (!(cudnn_version >= 90300 && at::native::cudnnv8_enabled_check_debug())) { TORCH_WARN_ONCE("cuDNN cannot be used for large non-batch-splittable convolutions" " if the V8 API is not enabled or before cuDNN version 9.3+." @@ -475,6 +474,10 @@ struct ConvParams { return true; } } + if (cudnn_conv_suggest_memory_format(input, weight) != at::MemoryFormat::Contiguous) { + // always use cudnn_depthwise for channels_last format + return true; + } if (detail::getCUDAHooks().supportsDepthwiseConvolutionWithCuDNN()) { bool kernel_cond = (use_cudnn(input, weight) && input.scalar_type() == kHalf && // only for FP16 @@ -1419,10 +1422,8 @@ static inline at::MemoryFormat determine_backend_memory_format( case ConvBackend::Miopen: case ConvBackend::MiopenDepthwise: case ConvBackend::MiopenTranspose: - if (detail::getCUDAHooks().compiledWithMIOpen() && miopen_conv_use_channels_last(input, weight)) { - TORCH_INTERNAL_ASSERT((k == 4 || k == 5), - "Expected 4D or 5D input for miopen memory format selection in determine_backend_memory_format()"); - backend_memory_format = (k == 5) ? at::MemoryFormat::ChannelsLast3d : at::MemoryFormat::ChannelsLast; + if (detail::getCUDAHooks().compiledWithMIOpen()) { + backend_memory_format = miopen_conv_suggest_memory_format(input, weight); } break; case ConvBackend::Mkldnn: diff --git a/aten/src/ATen/native/GroupedMMUtils.h b/aten/src/ATen/native/GroupedMMUtils.h new file mode 100644 index 000000000000..78993308cd5f --- /dev/null +++ b/aten/src/ATen/native/GroupedMMUtils.h @@ -0,0 +1,167 @@ +#pragma once + +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#include +#else +#include +#include +#include +#include +#endif + +namespace at::native { + +inline bool check_valid_strides_and_return_transposed(const Tensor& mat) { + IntArrayRef tensor_strides = mat.strides(); + IntArrayRef tensor_sizes = mat.sizes(); + int end_dim = mat.dim() - 1; + int alignment = 16 / mat.element_size(); + TORCH_CHECK(uint64_t(mat.data_ptr()) % 16 ==0, "expected data_ptr to be aligned to 16 bytes\n"); + if ((tensor_strides[end_dim - 1] == 1) && (tensor_strides[end_dim] >= std::max(1, tensor_sizes[end_dim - 1]))) { + TORCH_CHECK(tensor_strides[end_dim] % alignment == 0, "strides should be multiple of 16 bytes"); + return true; + } else if ((tensor_strides[end_dim] == 1) && (tensor_strides[end_dim - 1] >= std::max(1, tensor_sizes[end_dim]))) { + TORCH_CHECK(tensor_strides[end_dim - 1] % alignment == 0, "strides should be multiple of 16 bytes"); + return false; + } else { + TORCH_CHECK(false, "Invalid strides/sizes, got ", mat.strides(), " for strides and ", mat.sizes(), " for sizes"); + } +} + +inline at::Tensor create_grouped_gemm_output_tensor(const Tensor& mat_a, +const Tensor& mat_b, +const std::optional& offs, +c10::ScalarType out_dtype +) { + c10::SmallVector out_size; + const bool a_is_2d = mat_a.dim() == 2; + const bool b_is_2d = mat_b.dim() == 2; + if (a_is_2d) { + if (b_is_2d) { + out_size = {offs->size(0), mat_a.size(0), mat_b.size(1)}; + } else { + TORCH_CHECK(offs->size(0) == mat_b.size(0), "matrix batch sizes have to match"); + out_size = {mat_a.size(0), mat_b.size(-1)}; + } + } else { + if (b_is_2d) { + // this case is not actually encountered for MoE gemms + 
TORCH_CHECK(offs->size(0) == mat_a.size(0), "matrix batch sizes have to match"); + out_size = {mat_a.size(1), mat_b.size(1)}; + } else { // regular bmm + TORCH_CHECK(mat_a.size(0) == mat_b.size(0), "batched dimension has to match"); + out_size = {mat_a.size(0), mat_a.size(1), mat_b.size(-1)}; + } + } + + #ifndef USE_ROCM + // For TMA transfers, strides of output tensor have to be either + // 1, or aligned to 16 bytes. + const auto last_dim = out_size.size() - 1; + const auto alignment = 16 / c10::elementSize(out_dtype); + const int64_t size_padded = (out_size[last_dim] + alignment - 1) / alignment * alignment; + std::vector out_stride; + if (a_is_2d != b_is_2d) { + out_stride = {size_padded, 1}; + } else { + out_stride = {out_size[1] * size_padded, size_padded, 1}; + } + return at::empty_strided(out_size, out_stride, mat_a.options().dtype(out_dtype)); + #else + return at::empty(out_size, mat_a.options().dtype(out_dtype)); + #endif +} + +inline void _grouped_mm_validate_inputs(const Tensor& mat_a, const Tensor& mat_b, +const std::optional& offs, +const std::optional& bias, +std::optional out_dtype) { + TORCH_CHECK((mat_a.dtype() == at::kBFloat16) || (mat_a.dtype() == at::kFloat) || (mat_a.dtype() == at::kHalf), "Expected mat_a to be Float32, BFloat16 or Float16 matrix, got ", mat_a.scalar_type()); + TORCH_CHECK((mat_b.dtype() == at::kBFloat16) || (mat_b.dtype() == at::kFloat) || (mat_b.dtype() == at::kHalf), "Expected mat_b to be Float32, BFloat16 or Float16 matrix, got ", mat_b.scalar_type()); + TORCH_CHECK(mat_a.dim() == 2 || mat_a.dim() == 3, "mat_a has to be 2 or 3d"); + TORCH_CHECK(mat_b.dim() == 2 || mat_b.dim() == 3, "mat_b has to be 2 or 3d"); + const bool a_is_2d = mat_a.dim() == 2; + const bool b_is_2d = mat_b.dim() == 2; + if (!a_is_2d || !b_is_2d) { + TORCH_CHECK(mat_a.size(-1) == mat_b.size(-2), "contraction dimension of mat_a and mat_b must match"); + } + + // check that the strides are valid, the fn will throw an error if not + check_valid_strides_and_return_transposed(mat_a); + check_valid_strides_and_return_transposed(mat_b); + TORCH_CHECK(offs.has_value() == (a_is_2d || b_is_2d), "Have to provide offsets if there is a 2d matrix, or no offset if both matrices are 3d"); + + if (offs.has_value()) { + TORCH_CHECK(offs->dim() == 1, "offs has to be 1D"); + TORCH_CHECK(offs->dtype() == at::kInt, "Offsets have to be int32"); + } + TORCH_CHECK(!bias.has_value(), "Bias not supported yet"); +} + +inline c10::ScalarType _resolve_grouped_mm_out_dtype(const Tensor& mat_a, const Tensor& mat_b, +std::optional out_dtype) { + const auto out_dtype_ = out_dtype.value_or(mat_a.scalar_type()); + // TODO(future PR): enable float32 output dtype for bfloat16 and float16 inputs + TORCH_CHECK(out_dtype_ == mat_a.dtype(), "Grouped gemm output dtype must match `mat_a` dtype"); + return out_dtype_; +} + + +inline void _grouped_mm_fallback(const Tensor& mat_a, const Tensor& mat_b, +const std::optional& offs, +const std::optional& bias, +std::optional out_dtype, +Tensor out) { + LOG(INFO) << "fallback path for `torch._grouped_mm`, performance may not be optimal"; + const bool a_is_2d = mat_a.dim() == 2; + const bool b_is_2d = mat_b.dim() == 2; + if (a_is_2d && !b_is_2d) { + // 2d x 3d with offsets + int group_start_idx = 0; + auto offs_cpu = offs.value().cpu(); + for (int group_idx = 0; group_idx < offs_cpu.size(0); group_idx++) { + int group_end_idx = offs_cpu[group_idx].item(); + auto mat_a_slice = mat_a.slice(0, group_start_idx, group_end_idx); + auto out_slice = out.slice(0, group_start_idx, 
group_end_idx); + at::mm_out(out_slice, mat_a_slice, mat_b[group_idx]); + group_start_idx = group_end_idx; + } + + } else if (!a_is_2d && b_is_2d) { + // 3d x 2d with offsets + int group_start_idx = 0; + auto offs_cpu = offs.value().cpu(); + for (int group_idx = 0; group_idx < offs_cpu.size(0); group_idx++) { + int group_end_idx = offs_cpu[group_idx].item(); + auto mat_b_slice = mat_b.slice(1, group_start_idx, group_end_idx); + auto out_slice = out.slice(1, group_start_idx, group_end_idx); + at::mm_out(out_slice, mat_a[group_idx], mat_b_slice); + group_start_idx = group_end_idx; + } + + } else if (a_is_2d && b_is_2d) { + // 2d x 2d with offsets + int group_start_idx = 0; + auto offs_cpu = offs.value().cpu(); + for (int group_idx = 0; group_idx < offs_cpu.size(0); group_idx++) { + int group_end_idx = offs_cpu[group_idx].item(); + auto mat_a_slice = mat_a.slice(1, group_start_idx, group_end_idx); + auto mat_b_slice = mat_b.slice(0, group_start_idx, group_end_idx); + auto out_slice = out[group_idx]; + at::mm_out(out_slice, mat_a_slice, mat_b_slice); + group_start_idx = group_end_idx; + } + + } else { + // 3d x 3d without offsets - regular bmm + at::bmm_out(out, mat_a, mat_b); + } +} + + +} // namespace at::native diff --git a/aten/src/ATen/native/Linear.cpp b/aten/src/ATen/native/Linear.cpp index 5d3a84ea39f6..a744da3bcad2 100644 --- a/aten/src/ATen/native/Linear.cpp +++ b/aten/src/ATen/native/Linear.cpp @@ -185,6 +185,17 @@ static Tensor sumproduct_pair(const Tensor& left_, const Tensor& right_, IntArra // right: "lro, summed, ro" permuted with rpermutation and the three flattened // then the permuted output is a view of bmm(left, right) // finally, opermutation reverts the permutation to the original order of dimensions + // By default the output is "lro, lo, 1-for-summed-dims, ro" with original shape dimensions. + // However, if all dimensions from the right operand appear before those from the left + // operand in the final output, we can swap the operands so that bmm directly produces + // the result in the correct memory order. 
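The swap below can be illustrated with a concrete contraction: in "bij,bjk->bki" the right operand's free dimension k precedes the left operand's free dimension i in the output, which is the case the swap targets, so the underlying bmm can produce the requested layout directly instead of through a permuted view. The sketch below only verifies numerical equivalence against a permuted bmm; the shapes are arbitrary examples.

#include <ATen/ATen.h>

void check_swapped_output_order() {
  auto A = at::randn({2, 3, 4});
  auto B = at::randn({2, 4, 5});
  auto out = at::einsum("bij,bjk->bki", {A, B});
  auto ref = at::bmm(A, B).permute({0, 2, 1}).contiguous();
  TORCH_CHECK(at::allclose(out, ref), "einsum result mismatch");
}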
+ + bool swap_lo_ro = !lo.empty() && !ro.empty() && ro.back() < lo.front(); + if (swap_lo_ro) { + std::swap(left, right); + std::swap(lo, ro); + std::swap(lo_size, ro_size); + } auto out_num_dim = lro.size() + lo.size() + sum_dims_.size() + ro.size(); std::vector out_size; out_size.reserve(out_num_dim); diff --git a/aten/src/ATen/native/LinearAlgebra.cpp b/aten/src/ATen/native/LinearAlgebra.cpp index b62c584641db..616e6ec60e13 100644 --- a/aten/src/ATen/native/LinearAlgebra.cpp +++ b/aten/src/ATen/native/LinearAlgebra.cpp @@ -1360,7 +1360,8 @@ Tensor outer(const Tensor& self, const Tensor& vec2) { #endif -#if defined(__aarch64__) && AT_MKLDNN_ACL_ENABLED() +#if !defined(__aarch64__) || AT_MKLDNN_ACL_ENABLED() +// Used by default on x86 platforms and on AArch64+ACL static inline int64_t get_mkldnn_matmul_min_dim() { static auto value = [&] { const int64_t default_min_dim = [&] { @@ -1395,8 +1396,6 @@ static inline bool apply_mkldnn_matmul_heur(int64_t m, int64_t k, int64_t n) { return at::globalContext().userEnabledMkldnn() && m > min_dim && k > min_dim && n > min_dim && m * k * n > min_size; } #endif - - static void addmm_impl_cpu_( Tensor &result, const Tensor &self, Tensor m1, Tensor m2, const Scalar& beta, const Scalar& alpha) { TORCH_INTERNAL_ASSERT(self.dim() == 2 && m1.dim() == 2 && m2.dim() == 2); @@ -1772,8 +1771,8 @@ static inline void bmm_out_or_baddbmm_(const Tensor& self_or_result_, const Tens return (strides[2] == 1 && (sizes[1] == 1 || strides[1] >= sizes[2])) || (strides[1] == 1 && (sizes[2] == 1 || strides[2] >= sizes[1])); }; - -#if defined(__aarch64__) && AT_MKLDNN_ACL_ENABLED() +#if !defined(__aarch64__) || AT_MKLDNN_ACL_ENABLED() + // Always apply mkldnn heuristic on x86 platform, but on ARM only if compiled with ACL bool apply_heur = apply_mkldnn_matmul_heur(batch1.sizes()[1], batch1.sizes()[2], batch2.sizes()[2]); if (apply_heur && use_mkldnn_matmul(batch1, batch2, self_or_result)) { try { @@ -1785,7 +1784,6 @@ static inline void bmm_out_or_baddbmm_(const Tensor& self_or_result_, const Tens } } #endif - if (contraction_size * res_rows * res_cols < 400) { if (is_bmm_out) { AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(kBFloat16, kHalf, batch1.scalar_type(), "bmm", [&] { diff --git a/aten/src/ATen/native/LossNLL.cpp b/aten/src/ATen/native/LossNLL.cpp index 53d56622fe62..ca86292403fb 100644 --- a/aten/src/ATen/native/LossNLL.cpp +++ b/aten/src/ATen/native/LossNLL.cpp @@ -47,10 +47,14 @@ TORCH_META_FUNC(nll_loss_forward) TORCH_CHECK( target.dim() <= 1, "0D or 1D target tensor expected, multi-target not supported"); - - auto no_batch_dim = self.dim() == 1 && target.dim() == 0; + if (self.dim() == 1 && target.dim() == 1) { + TORCH_CHECK_VALUE( + target.size(0) == 1, + "For 1D input, 1D target must have size 1, but got target size: ", + target.size(0)); + } TORCH_CHECK( - no_batch_dim || (self.size(0) == target.size(0)), + self.dim() == 1 || (self.size(0) == target.size(0)), "size mismatch (got input: ", self.sizes(), ", target: ", diff --git a/aten/src/ATen/native/Onehot.cpp b/aten/src/ATen/native/Onehot.cpp index 2ac513bf0888..8833bdb6e471 100644 --- a/aten/src/ATen/native/Onehot.cpp +++ b/aten/src/ATen/native/Onehot.cpp @@ -1,5 +1,6 @@ #define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include +#include #ifndef AT_PER_OPERATOR_HEADERS #include @@ -24,8 +25,13 @@ Tensor one_hot(const Tensor &self, int64_t num_classes) { if (num_classes == -1) { num_classes = self.max().item().toLong() + 1; } - at::Tensor index = at::arange(num_classes, self.options()); - return 
at::eq(self.unsqueeze(-1), index).to(kLong); + { + // If `self` is a DTensor, then allow implicit replication + // of the `index` Tensor. + at::DTensorAllowImplicitReplication guard; + at::Tensor index = at::arange(num_classes, self.options()); + return at::eq(self.unsqueeze(-1), index).to(kLong); + } } auto shape = self.sizes().vec(); diff --git a/aten/src/ATen/native/PadNd.cpp b/aten/src/ATen/native/PadNd.cpp index 8072d24a1090..8099648d37b2 100644 --- a/aten/src/ATen/native/PadNd.cpp +++ b/aten/src/ATen/native/PadNd.cpp @@ -240,8 +240,15 @@ Tensor _pad_enum_symint(const Tensor &self, c10::SymIntArrayRef pad, int64_t mod default: {} } } - C10_THROW_ERROR(NotImplementedError, - "Only 2D, 3D, 4D, 5D padding with non-constant padding are supported for now"); + + std::ostringstream error_msg; + error_msg << "Padding size " << pad.size() << " is not supported for " << input_dim << "D input tensor.\n"; + error_msg << "Supported combinations for non-constant padding:\n"; + error_msg << " - 2D or 3D input: padding size = 2 (pads last dimension)\n"; + error_msg << " - 3D or 4D input: padding size = 4 (pads last 2 dimensions)\n"; + error_msg << " - 4D or 5D input: padding size = 6 (pads last 3 dimensions)"; + + C10_THROW_ERROR(NotImplementedError, error_msg.str()); } Tensor pad_symint(const Tensor &self, c10::SymIntArrayRef pad, std::string_view mode, std::optional value) { diff --git a/aten/src/ATen/native/TensorAdvancedIndexing.cpp b/aten/src/ATen/native/TensorAdvancedIndexing.cpp index 408faea1b764..7d613fc02312 100644 --- a/aten/src/ATen/native/TensorAdvancedIndexing.cpp +++ b/aten/src/ATen/native/TensorAdvancedIndexing.cpp @@ -2174,7 +2174,7 @@ static void _scatter_via_index_put( if (self.dim() == 1 || broadcast_index) { Tensor squeezed = index; if (broadcast_index && index.dim() > 1) { - for (const auto d : c10::irange(index.dim())) { + for (int64_t d = index.dim() - 1; d >= 0; --d) { if (d == dim) { continue; } diff --git a/aten/src/ATen/native/TensorFactories.cpp b/aten/src/ATen/native/TensorFactories.cpp index 054cc66cf8eb..1886e65fc1ed 100644 --- a/aten/src/ATen/native/TensorFactories.cpp +++ b/aten/src/ATen/native/TensorFactories.cpp @@ -1640,6 +1640,9 @@ Tensor zeros_symint( std::optional layout, std::optional device, std::optional pin_memory) { + for (const auto& dim_size : size) { + TORCH_CHECK(dim_size >= 0, "zeros: Dimension size must be non-negative."); + } Layout layout_ = layout.value_or(Layout::Strided); if (at::sparse_csr::is_sparse_compressed(layout_)) { return zeros_sparse_compressed_symint( diff --git a/aten/src/ATen/native/TensorProperties.cpp b/aten/src/ATen/native/TensorProperties.cpp index 77acfe47363e..4fa0556ad785 100644 --- a/aten/src/ATen/native/TensorProperties.cpp +++ b/aten/src/ATen/native/TensorProperties.cpp @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -57,6 +58,12 @@ c10::SymInt sym_size(const Tensor& self, int64_t dim) { return self.sym_size(dim); } +c10::SymBool sym_is_contiguous( + const Tensor& self, + c10::MemoryFormat memory_format) { + return self.sym_is_contiguous(memory_format); +} + c10::SymInt sym_stride(const Tensor& self, int64_t dim) { return self.sym_stride(dim); } diff --git a/aten/src/ATen/native/cpu/Loops.h b/aten/src/ATen/native/cpu/Loops.h index 5715fd8f047f..83b51a998563 100644 --- a/aten/src/ATen/native/cpu/Loops.h +++ b/aten/src/ATen/native/cpu/Loops.h @@ -89,7 +89,7 @@ execute_op(char* C10_RESTRICT data[], const int64_t* strides, int64_t i, int64_t using result_type = typename traits::result_type; 
for (; i < n; i++) { result_type* out_ptr = (result_type*)(data[0] + i * strides[0]); - *out_ptr = c10::guts::apply(op, dereference( + *out_ptr = std::apply(op, dereference( &data[1], &strides[1], i)); @@ -102,7 +102,7 @@ inline void execute_op(char* C10_RESTRICT data[], const int64_t* strides, int64_t i, int64_t n, func_t&& op) { using traits = function_traits; for (; i < n; i++) { - c10::guts::apply(op, dereference( + std::apply(op, dereference( &data[0], &strides[0], i)); @@ -162,7 +162,7 @@ void handle_tuple_outputs(char* C10_RESTRICT data[], } // Loop operation for `cpu_kernel_multiple_outputs`. -// 1. Use `c10::guts::apply` to make dynamic method invocation +// 1. Use `std::apply` to make dynamic method invocation // for the lambda passed in `cpu_kernel_multiple_outputs`. // 2. Iterate over the members of the returned tuple, set the corresponding // output tensor by the tuple member in `handle_tuple_outputs` function. @@ -183,7 +183,7 @@ multiple_outputs_loop(char* C10_RESTRICT data[], const int64_t* strides_, int64_ } for (; i < n; i++) { - auto output = c10::guts::apply(op, dereference( + auto output = std::apply(op, dereference( &data[num_outputs], &strides[num_outputs], i)); @@ -213,8 +213,8 @@ vectorized_loop(char** C10_RESTRICT data_, int64_t n, int64_t S, func_t&& op, ve for (; i <= n - 2 * Vec::size(); i += 2 * Vec::size()) { auto args1 = dereference_vec(&data[1], opt_scalar, S, i); auto args2 = dereference_vec(&data[1], opt_scalar, S, i + Vec::size()); - auto out1 = c10::guts::apply(vop, std::move(args1)); - auto out2 = c10::guts::apply(vop, std::move(args2)); + auto out1 = std::apply(vop, std::move(args1)); + auto out2 = std::apply(vop, std::move(args2)); out1.store(data[0] + i * sizeof(scalar_t)); out2.store(data[0] + (i + Vec::size()) * sizeof(scalar_t)); } diff --git a/aten/src/ATen/native/cpu/PaddingKernel.cpp b/aten/src/ATen/native/cpu/PaddingKernel.cpp index e3f08194bb58..59d838b9782d 100644 --- a/aten/src/ATen/native/cpu/PaddingKernel.cpp +++ b/aten/src/ATen/native/cpu/PaddingKernel.cpp @@ -156,7 +156,7 @@ void cpu_padding( int64_t offset_h = ndim >= 2 ? p.offsets[ndim - 2] : 0; int64_t offset_w = p.offsets[ndim - 1]; - // do vectorized copy whe output is overlapped with input on W, + // do vectorized copy when output is overlapped with input on W, // only applies to positive padding auto loop = [=](scalar_t* out, const scalar_t* in, bool positive_padding) { if (positive_padding) { diff --git a/aten/src/ATen/native/cpu/batch_norm_kernel.cpp b/aten/src/ATen/native/cpu/batch_norm_kernel.cpp index 5a288193143d..d013dfa0485e 100644 --- a/aten/src/ATen/native/cpu/batch_norm_kernel.cpp +++ b/aten/src/ATen/native/cpu/batch_norm_kernel.cpp @@ -318,7 +318,7 @@ batch_norm_cpu_collect_stats_channels_last_impl( // // The optimal THRESHOLD to tile was found empirically. // When C > THRESHOLD, C is large enough that the benefit from tiling and vectorization outweigh the synchronization overhead. - // Wehn C <= TILE_SIZE, the problem size is small enough (C <= TILE_SIZE && NHW <= max_threads) that it's better to launch single thread with vectorization than C threads without vectorization. + // When C <= TILE_SIZE, the problem size is small enough (C <= TILE_SIZE && NHW <= max_threads) that it's better to launch single thread with vectorization than C threads without vectorization. // // When num_threads == 1, always use Method 2 as there is no synchronization overhead. 
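The Loops.h changes swap c10::guts::apply for C++17 std::apply, which invokes a callable with the elements of a tuple as its arguments; in the kernels that tuple is built by the dereference helpers from the strided input pointers. A standalone illustration of the call shape, using two literals in place of the dereferenced operands:

#include <cstdio>
#include <tuple>
#include <utility>

int main() {
  auto op = [](float a, float b) { return a + b; };
  std::tuple<float, float> args{1.5f, 2.5f};
  // std::apply unpacks the tuple into op's parameter list.
  float out = std::apply(op, args);
  std::printf("%f\n", out);  // prints 4.000000
  return 0;
}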
// diff --git a/aten/src/ATen/native/cuda/Blas.cpp b/aten/src/ATen/native/cuda/Blas.cpp index a7c17893903b..23447c7e09b3 100644 --- a/aten/src/ATen/native/cuda/Blas.cpp +++ b/aten/src/ATen/native/cuda/Blas.cpp @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -1079,6 +1080,16 @@ static bool _scaled_mm_allowed_device(bool sm90_only=false, bool sm100_only=fals #endif } +static bool _grouped_mm_allowed_device() { +#ifdef USE_ROCM + return false; +#else + auto dprops = at::cuda::getCurrentDeviceProperties(); + // CUDA capability 8.0 and greater + return dprops->major >= 8; +#endif +} + #ifdef USE_ROCM static bool _scaled_mm_is_fnuz() { return at::detail::getCUDAHooks().isGPUArch({"gfx942"}); @@ -1540,71 +1551,8 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2, } namespace { - at::Tensor create_grouped_gemm_output_tensor(const Tensor& mat_a, - const Tensor& mat_b, - const std::optional& offs, - std::optional out_dtype - ) { - c10::SmallVector out_size; - const bool a_is_2d = mat_a.dim() == 2; - const bool b_is_2d = mat_b.dim() == 2; - if (a_is_2d) { - if (b_is_2d) { - out_size = {offs->size(0), mat_a.size(0), mat_b.size(1)}; - } else { - TORCH_CHECK(offs->size(0) == mat_b.size(0), "matrix batch sizes have to match"); - out_size = {mat_a.size(0), mat_b.size(-1)}; - } - } else { - if (b_is_2d) { - // this case is not actually encountered for MoE gemms - TORCH_CHECK(offs->size(0) == mat_a.size(0), "matrix batch sizes have to match"); - out_size = {mat_a.size(1), mat_b.size(1)}; - } else { // regular bmm - TORCH_CHECK(mat_a.size(0) == mat_b.size(0), "batched dimension has to match"); - out_size = {mat_a.size(0), mat_a.size(1), mat_b.size(-1)}; - } - } - - const auto out_dtype_ = out_dtype.value_or(kBFloat16); - TORCH_CHECK(out_dtype_ == kBFloat16, "Only bf16 high precision output types are supported for grouped gemm"); - - #ifndef USE_ROCM - // For TMA transfers, strides of output tensor have to be either - // 1, or aligned to 16 bytes. 
- const auto last_dim = out_size.size() - 1; - const auto alignment = 16 / c10::elementSize(out_dtype_); - const int64_t size_padded = (out_size[last_dim] + alignment - 1) / alignment * alignment; - std::vector out_stride; - if (a_is_2d != b_is_2d) { - out_stride = {size_padded, 1}; - } else { - out_stride = {out_size[1] * size_padded, size_padded, 1}; - } - return at::empty_strided(out_size, out_stride, mat_a.options().dtype(out_dtype_)); - #else - return at::empty(out_size, mat_a.options().dtype(out_dtype_)); - #endif - } - - bool check_valid_strides_and_return_transposed(const Tensor& mat) { - IntArrayRef tensor_strides = mat.strides(); - IntArrayRef tensor_sizes = mat.sizes(); - int end_dim = mat.dim() - 1; - int alignment = 16 / mat.element_size(); - TORCH_CHECK(uint64_t(mat.data_ptr()) % 16 ==0, "expected data_ptr to be aligned to 16 bytes\n"); - if ((tensor_strides[end_dim - 1] == 1) && (tensor_strides[end_dim] >= std::max(1, tensor_sizes[end_dim - 1]))) { - TORCH_CHECK(tensor_strides[end_dim] % alignment == 0, "strides should be multiple of 16 bytes"); - return true; - } else if ((tensor_strides[end_dim] == 1) && (tensor_strides[end_dim - 1] >= std::max(1, tensor_sizes[end_dim]))) { - TORCH_CHECK(tensor_strides[end_dim - 1] % alignment == 0, "strides should be multiple of 16 bytes"); - return false; - } else { - TORCH_CHECK(false, "Invalid strides/sizes, got ", mat.strides(), " for strides and ", mat.sizes(), " for sizes"); - } - } - - void check_scale(const Tensor& mat, const Tensor& scale, const int dim, const int arg_idx, const int scale_multiplier=1) { + void _check_scales_fp8_rowwise(const Tensor& mat, const Tensor& scale, const int dim, const int arg_idx, const int scale_multiplier=1) { + // Checks scales for 2d or 3d target tensors (`mat`). if (mat.dim() == 2) { TORCH_CHECK( scale.dim() == 1, @@ -1638,9 +1586,66 @@ namespace { "scale must have the same first dimension as mat for arg ", arg_idx); } -} + } + void _check_scales_mxfp8(const Tensor& mat, const Tensor& scale, const int dim, const int arg_idx) { + // Checks scales for 2d or 3d target tensors (`mat`). + if (mat.dim() == 2) { + // For MXFP8, 2d tensors have variable size groups represented as subtensors, + // that are converted to blocked padded format individually, + // so we can't check the scale sizes without doing a d2h sync to get the group sizes here. + TORCH_CHECK( + scale.dim() == mat.dim(), + "for mxfp8, scale must have same number of dimensions as parent tensor, but got mat.dim() = ", mat.dim(), " and scale.dim() = ", scale.dim(), " for arg ", arg_idx); + + // LHS mat shape (M, total_K) -> scale shape (rounded_up(M, 128), rounded_up_per_group(K/32, 4)) + // RHS mat shape (total_K, N) -> scale shape (rounded_up(N, 128), rounded_up_per_group(K/32, 4)) + // * weight is transposed prior to the call, scale stays non-transposed. + bool LHS = arg_idx == 0; + int scale_dim_to_check = 0; + int mat_dim_to_check = LHS ? 0 : 1; + TORCH_CHECK( + scale.size(scale_dim_to_check) >= mat.size(mat_dim_to_check), + "for mxfp8, arg ", arg_idx, " tensor shape (", mat.size(0), ", ", mat.size(1), ") ", + "must have scale.shape[", scale_dim_to_check, "] >= ", mat.size(mat_dim_to_check), " but got scale.shape=(", scale.size(0), ", ", scale.size(1), ")"); + } else { + // For MXFP8, 3d tensors have static group sizes (stack of 2d tensors), + // so we can check the exact expected scale sizes here without a d2h sync. 
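The 3d branch below derives the expected blocked scale shape from the rounding rules stated in the comments: K/32 scales along the contraction dimension, rounded up to a multiple of 4, and N rounded up to a multiple of 128, then flattened per group. A worked numeric sketch of that arithmetic; the (G, K, N) values are made-up examples, not values from this change.

#include <cstdint>
#include <cstdio>

int64_t round_up(int64_t x, int64_t y) { return ((x + y - 1) / y) * y; }

int main() {
  // Example 3d RHS of shape (G, K, N) = (8, 4096, 520).
  int64_t G = 8, K = 4096, N = 520;
  int64_t blocked_scale_K = round_up(K / 32, 4);  // 4096/32 = 128, already a multiple of 4
  int64_t blocked_scale_N = round_up(N, 128);     // 520 rounds up to 640
  // Expected stacked, flattened scale shape: (G, blocked_scale_K * blocked_scale_N).
  std::printf("(%lld, %lld)\n", (long long)G,
              (long long)(blocked_scale_K * blocked_scale_N));  // (8, 81920)
  return 0;
}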
+ auto round_up = [](auto x, auto y) { + return ((x + y - 1) / y) * y; + }; + + // TODO: this is for 3d tensor in 2d-3d case specifically. + // We'll need to support 3d-3d and 3d-2d cases once mxfp8 grouped gemm supports them. + int64_t G = mat.size(0); + int64_t K = mat.size(1); + int64_t N = mat.size(2); + int64_t blocked_scale_K = round_up(K/32, 4); + int64_t blocked_scale_N = round_up(N, 128); + + // fbgemm expects stack of flattened blocked scales for 3d tensor, shape (G, blocked_scale_K * blocked_scale_N). + TORCH_CHECK( + scale.dim() == mat.dim() - 1, + "for mxfp8 2d-3d grouped GEMM, the 3d tensor of shape (G,K,N) must have a 2d scale of shape (G, blocked_scale_K * blocked_scale_N), but scale is ", scale.dim(), "D for arg ", arg_idx + ); + TORCH_CHECK( + scale.size(0) == G && scale.size(1) == blocked_scale_K * blocked_scale_N, + "for mxfp8, the tensor shape (", G, ", ", K, ", ", N, ") must have scale shape (", G, ",", blocked_scale_K, ",", blocked_scale_N, ") for arg ", arg_idx + ); + } + } + void check_scale(const Tensor& mat, const Tensor& scale, const int dim, const int arg_idx, const int scale_multiplier=1) { + bool using_fp8_rowwise = scale.scalar_type() == kFloat; + bool using_mxfp8 = scale.scalar_type() == at::kFloat8_e8m0fnu; + if (using_fp8_rowwise) { + _check_scales_fp8_rowwise(mat, scale, dim, arg_idx, scale_multiplier); + } else if (using_mxfp8) { + _check_scales_mxfp8(mat, scale, dim, arg_idx); + } else { + TORCH_CHECK(false, "scale must be float32 or float8_e8m0fnu, but got ", scale.dtype()); + } + } } Tensor @@ -1665,8 +1670,8 @@ const std::optional& bias, const std::optional& scale_result, std::optional out_dtype, bool use_fast_accum) { - bool allowed_device = _scaled_mm_allowed_device(); - TORCH_CHECK(allowed_device, "torch._scaled_grouped_mm is only supported on CUDA devices with compute capability = 9.0, or ROCm MI300+"); + bool allowed_device = _scaled_mm_allowed_device(/*sm90_only*/true, /*sm100_only*/true); + TORCH_CHECK(allowed_device, "torch._scaled_grouped_mm is only supported on CUDA devices with compute capability = [9.0, 10.0], or ROCm MI300+"); TORCH_CHECK(!check_valid_strides_and_return_transposed(mat_a), "Expected mat1 to not be transposed"); TORCH_CHECK(check_valid_strides_and_return_transposed(mat_b), "Expected mat2 to be transposed"); @@ -1699,16 +1704,47 @@ bool use_fast_accum) { TORCH_CHECK(offs->dtype() == at::kInt, "Offsets have to be int32"); } - // Both Per-Tensor and Row-wise scaling expect fp32 tensors + // FP8 per-tensor and per-row scaling expect fp32 scales. + // MXFP8 expects float8_e8m0fnu scales. TORCH_CHECK( - scale_a.scalar_type() == kFloat && scale_b.scalar_type() == kFloat, - "Both scale_a and scale_b must be float (fp32) tensors."); + (scale_a.scalar_type() == kFloat && scale_b.scalar_type() == kFloat) || + (scale_a.scalar_type() == at::kFloat8_e8m0fnu && scale_b.scalar_type() == at::kFloat8_e8m0fnu), + "For FP8 tensorwise and rowwise, both scales must both be float32 tensors. For MXFP8, scales must both be float8_e8m0fnu tensors."); const int scale_multiplier = (mat_a.dim() == 2 && mat_b.dim() == 2) ? 
offs->size(0) : 1; check_scale(mat_a, scale_a, 0 ,0, scale_multiplier); check_scale(mat_b, scale_b, 1, 1, scale_multiplier); - Tensor out = create_grouped_gemm_output_tensor(mat_a, mat_b, offs, out_dtype); + const auto out_dtype_ = out_dtype.value_or(kBFloat16); + TORCH_CHECK(out_dtype_ == kBFloat16, "Only bf16 high precision output types are supported for grouped gemm"); + + Tensor out = create_grouped_gemm_output_tensor(mat_a, mat_b, offs, out_dtype_); + +#if defined(USE_FBGEMM_GENAI) && defined(USE_CUDA) && !defined(USE_ROCM) + // MXFP8 grouped GEMM dispatching + bool is_mx8mx8bf16 = ( + mat_a.scalar_type() == at::kFloat8_e4m3fn && mat_b.scalar_type() == at::kFloat8_e4m3fn && + scale_a.scalar_type() == at::kFloat8_e8m0fnu && scale_b.scalar_type() == at::kFloat8_e8m0fnu + ); + TORCH_CHECK(out_dtype == at::kBFloat16, "Only bf16 out_dtype is supported for MXFP8 grouped gemm"); + + if (is_mx8mx8bf16) { + bool b_is_3d = mat_b.dim() == 3; + bool is_2d_2d = a_is_2d && b_is_2d; + bool is_2d_3d = a_is_2d && b_is_3d; + TORCH_CHECK(is_2d_2d || is_2d_3d, "MXFP8 grouped GEMM currently only supports 2d-2d and 2d-3d cases"); + TORCH_CHECK(offs.has_value(), "MXFP8 2d-2d and 2d-3d grouped GEMMs requires offsets"); + + fbgemm_gpu::mx8mx8bf16_grouped_mm( + mat_a, + mat_b, + scale_a, + scale_b, + offs.value(), + out); + return out; + } +#endif #ifndef USE_ROCM TORCH_CHECK(mat_a.dtype() == at::kFloat8_e4m3fn, "Expected mat_a to be Float8_e4m3 matrix got ", mat_a.scalar_type()); @@ -1741,6 +1777,7 @@ bool use_fast_accum) { #else TORCH_CHECK(false, "grouped gemm is not supported without USE_FBGEMM_GENAI on ROCM") #endif + #endif } @@ -1750,33 +1787,21 @@ const std::optional& offs, const std::optional& bias, std::optional out_dtype) { #ifndef USE_ROCM - bool allowed_device = _scaled_mm_allowed_device(/*sm90_only*/true, /*sm100_only*/true); - TORCH_CHECK(allowed_device, "torch._grouped_mm is only supported on CUDA devices with compute capability = 9.0, 10.0"); - - TORCH_CHECK(mat_a.dtype() == at::kBFloat16, "Expected mat_a to be BFloat16 matrix got ", mat_a.scalar_type()); - TORCH_CHECK(mat_b.dtype() == at::kBFloat16, "Expected mat_a to be BFloat16 matrix got ", mat_b.scalar_type()); - TORCH_CHECK(mat_a.dim() == 2 || mat_a.dim() == 3, "mat_a has to be 2 or 3d"); - TORCH_CHECK(mat_b.dim() == 2 || mat_b.dim() == 3, "mat_b has to be 2 or 3d"); - const bool a_is_2d = mat_a.dim() == 2; - const bool b_is_2d = mat_b.dim() == 2; - if (!a_is_2d || !b_is_2d) { - TORCH_CHECK(mat_a.size(-1) == mat_b.size(-2), "contraction dimension of mat_a and mat_b must match"); - } - - // check that the strides are valid, the fn will throw an error if not - check_valid_strides_and_return_transposed(mat_a); - check_valid_strides_and_return_transposed(mat_b); - TORCH_CHECK(offs.has_value() == (a_is_2d || b_is_2d), "Have to provide offsets if there is a 2d matrix, or no offset if both matrices are 3d"); - - if (offs.has_value()) { - TORCH_CHECK(offs->dim() == 1, "offs has to be 1D"); - TORCH_CHECK(offs->dtype() == at::kInt, "Offsets have to be int32"); + _grouped_mm_validate_inputs(mat_a, mat_b, offs, bias, out_dtype); + bool a_b_and_out_are_bf16 = ( + mat_a.dtype() == at::kBFloat16 && + mat_b.dtype() == at::kBFloat16 && + out_dtype.value_or(at::kBFloat16) == at::kBFloat16 + ); + bool use_fast_path = _scaled_mm_allowed_device(/*sm90_only*/true, /*sm100_only*/true) && a_b_and_out_are_bf16; + const auto out_dtype_ = _resolve_grouped_mm_out_dtype(mat_a, mat_b, out_dtype); + Tensor out = create_grouped_gemm_output_tensor(mat_a, mat_b, offs, 
out_dtype_); + if (use_fast_path) { + // fast path, no d2h sync needed + at::cuda::detail::bf16bf16_grouped_mm(mat_a, mat_b, offs, bias, out); + } else { + _grouped_mm_fallback(mat_a, mat_b, offs, bias, out_dtype, out); } - TORCH_CHECK(!bias.has_value(), "Bias not supported yet"); - - Tensor out = create_grouped_gemm_output_tensor(mat_a, mat_b, offs, out_dtype); - - at::cuda::detail::bf16bf16_grouped_mm(mat_a, mat_b, offs, bias, out); return out; #else TORCH_CHECK(false, "grouped gemm is not supported on ROCM") diff --git a/aten/src/ATen/native/cuda/CUDALoops.cuh b/aten/src/ATen/native/cuda/CUDALoops.cuh index 12ad84a15b18..ee28c5c1693f 100644 --- a/aten/src/ATen/native/cuda/CUDALoops.cuh +++ b/aten/src/ATen/native/cuda/CUDALoops.cuh @@ -999,12 +999,41 @@ void gpu_kernel_impl(TensorIteratorBase& iter, const func_t& f) { dtypes[i] = iter.dtype(i); } auto offset_calc = ::make_offset_calculator(iter); +#ifdef USE_ROCM + constexpr int grp_sz = 128; + launch_legacy_kernel_manual_unroll(numel, [=] GPU_LAMBDA(int idx, bool unrl) { + if (unrl) { + auto offsets0 = offset_calc.get(idx); + auto offsets1 = offset_calc.get(idx + grp_sz); + auto offsets2 = offset_calc.get(idx + grp_sz * 2); + auto offsets3 = offset_calc.get(idx + grp_sz * 3); + void* out0 = data[0] + offsets0[0]; + void* out1 = data[0] + offsets1[0]; + void* out2 = data[0] + offsets2[0]; + void* out3 = data[0] + offsets3[0]; + arg0_t result0 = invoke(f, &data[1], &offsets0[1], &dtypes[1], 1); + arg0_t result1 = invoke(f, &data[1], &offsets1[1], &dtypes[1], 1); + arg0_t result2 = invoke(f, &data[1], &offsets2[1], &dtypes[1], 1); + arg0_t result3 = invoke(f, &data[1], &offsets3[1], &dtypes[1], 1); + c10::cast_and_store(dtypes[0], out0, result0); + c10::cast_and_store(dtypes[0], out1, result1); + c10::cast_and_store(dtypes[0], out2, result2); + c10::cast_and_store(dtypes[0], out3, result3); + } else { + auto offsets = offset_calc.get(idx); + void* out = data[0] + offsets[0]; + arg0_t result = invoke(f, &data[1], &offsets[1], &dtypes[1], 1); + c10::cast_and_store(dtypes[0], out, result); + } + }); +#else launch_legacy_kernel<128, 4>(numel, [=] GPU_LAMBDA(int idx) { auto offsets = offset_calc.get(idx); void* out = data[0] + offsets[0]; arg0_t result = invoke(f, &data[1], &offsets[1], &dtypes[1], 1); c10::cast_and_store(dtypes[0], out, result); }); +#endif } } diff --git a/aten/src/ATen/native/cuda/Copy.cu b/aten/src/ATen/native/cuda/Copy.cu index 59b0426bab1f..62a07e1e28c8 100644 --- a/aten/src/ATen/native/cuda/Copy.cu +++ b/aten/src/ATen/native/cuda/Copy.cu @@ -42,6 +42,19 @@ void bfloat16_copy_kernel_cuda(TensorIteratorBase &iter) { }); } +#ifdef USE_ROCM +void bfloat16tofloat32_copy_kernel_cuda(TensorIteratorBase &iter) { + gpu_kernel_nocast(iter, [] GPU_LAMBDA(at::BFloat16 value) { + return static_cast(value); + }); +} +void float16tofloat32_copy_kernel_cuda(TensorIteratorBase &iter) { + gpu_kernel_nocast(iter, [] GPU_LAMBDA(at::Half value) { + return static_cast(value); + }); +} +#endif + void float8_copy_kernel_cuda(TensorIteratorBase &iter) { ScalarType dtype = iter.dtype(0); ScalarType other_dtype = iter.dtype(1); @@ -187,7 +200,17 @@ void direct_copy_kernel_cuda(TensorIteratorBase &iter) { } else { float16_copy_kernel_cuda(iter); } - } else if (isBitsType(dtype)) { + } +#ifdef USE_ROCM + else if ((iter.dtype(1) == kBFloat16 || iter.dtype(1) == kHalf) && dtype == kFloat) { + if (iter.dtype(1) == kBFloat16) { + bfloat16tofloat32_copy_kernel_cuda(iter); + } else { + float16tofloat32_copy_kernel_cuda(iter); + } + } +#endif + else if 
(isBitsType(dtype)) { TORCH_CHECK(dtype == iter.dtype(1), "copy_() does not support casting " "bits types to different bits types. Source dtype is ", iter.dtype(1), "target dtype is ", dtype); AT_DISPATCH_BIT_TYPES(dtype, "copy_", [&] { diff --git a/aten/src/ATen/native/cuda/Indexing.cu b/aten/src/ATen/native/cuda/Indexing.cu index 02feb55cb69d..dacef18c79b6 100644 --- a/aten/src/ATen/native/cuda/Indexing.cu +++ b/aten/src/ATen/native/cuda/Indexing.cu @@ -59,7 +59,7 @@ constexpr uint64_t getDefaultMaxThreadsPerBlock() { #ifdef USE_ROCM #define SKIP_SORTED_INDICES 32 template -__global__ void indexing_backward_kernel( +__global__ void indexing_backward_kernel_many_indices( const int64_t* sorted_indices, const int64_t* indices, const scalar_t* grad_output, scalar_t* grad_weight, int64_t numel, int64_t stride, int64_t stride_before, int64_t outer_dim, bool accumulate) { using opmath_t = at::opmath_type; @@ -254,7 +254,8 @@ __global__ void indexing_backward_kernel_stride_1( } } } -#else +#endif + template __global__ void indexing_backward_kernel( const int64_t* sorted_indices, const int64_t* indices, const scalar_t* grad_output, scalar_t* grad_weight, @@ -333,6 +334,7 @@ __global__ void indexing_backward_kernel( } } +#ifndef USE_ROCM template __global__ void indexing_backward_kernel_stride_1( const int64_t* sorted_indices, const int64_t* indices, const scalar_t* grad_output, scalar_t* grad_weight, @@ -708,6 +710,9 @@ void index_put_with_sort_kernel(Tensor & self, const c10::List(at::cuda::getCurrentDeviceProperties()->maxGridSize[1], ceil_div(sliceSize, (int64_t) (warp_size))) : grid.y, + grid.z); dim3 new_grid(ceil_div(num_indices, (int64_t) (indices_per_block * warp_size)), grid.y, grid.z); size_t smem_dups_size = indices_per_block * warp_size * sizeof(int64_t); #define KERNEL_GRID new_grid @@ -780,11 +785,43 @@ void index_put_with_sort_kernel(Tensor & self, const c10::List= 200000) + AT_DISPATCH_V2( + expandedValue.scalar_type(), + "indexing_backward_many_indices", + AT_WRAP([&] { + indexing_backward_kernel_many_indices<<>>( + sorted_indices.const_data_ptr(), + orig_indices.const_data_ptr(), + expandedValue.const_data_ptr(), + src_.mutable_data_ptr(), + num_indices, + sliceSize, + strideBefore, + nElemBefore, + accumulate); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + }), + AT_EXPAND(AT_ALL_TYPES_AND_COMPLEX), + // AT_EXPAND(AT_FLOAT8_TYPES), + // TODO(#113663): clean up accumulation behavior in float8 dtypes, accumulate=True + // should not be supported here, then reenable AT_FLOAT8_DTYPES + kFloat8_e4m3fn, + kFloat8_e5m2, + kFloat8_e4m3fnuz, + kFloat8_e5m2fnuz, + kComplexHalf, + kHalf, + kBool, + kBFloat16); + else +#endif AT_DISPATCH_V2( expandedValue.scalar_type(), "indexing_backward", AT_WRAP([&] { - indexing_backward_kernel<<>>( + indexing_backward_kernel<<>>( sorted_indices.const_data_ptr(), orig_indices.const_data_ptr(), expandedValue.const_data_ptr(), diff --git a/aten/src/ATen/native/cuda/SegmentReduce.cu b/aten/src/ATen/native/cuda/SegmentReduce.cu index 3acb359342f1..c6f88692a8a5 100644 --- a/aten/src/ATen/native/cuda/SegmentReduce.cu +++ b/aten/src/ATen/native/cuda/SegmentReduce.cu @@ -20,7 +20,7 @@ // SegmentReduce compilation with CUDA-12.9 causes NVCC crash on Windows // See https://github.com/pytorch/pytorch/issues/156181 -#if !defined(_WIN32) || CUDART_VERSION < 12090 +#if !(defined(_WIN32) && CUDART_VERSION == 12090) namespace at::native { @@ -606,4 +606,4 @@ REGISTER_DISPATCH( } // namespace at::native -#endif +#endif \ No newline at end of file diff --git 
a/aten/src/ATen/native/cuda/layer_norm_kernel.cu b/aten/src/ATen/native/cuda/layer_norm_kernel.cu index 940680eb3682..81387bcceaf0 100644 --- a/aten/src/ATen/native/cuda/layer_norm_kernel.cu +++ b/aten/src/ATen/native/cuda/layer_norm_kernel.cu @@ -141,7 +141,11 @@ WelfordDataLN cuWelfordOnlineSum( if constexpr (!rms_norm){ U delta = val - curr_sum.mean; U new_count = curr_sum.count + 1.f; +#if defined(USE_ROCM) && defined(PYTORCH_LAYERNORM_FAST_RECIPROCAL) + U new_mean = curr_sum.mean + delta * __builtin_amdgcn_rcpf(new_count); +#else U new_mean = curr_sum.mean + delta * (1.f/new_count); //proper division is slow, this is less accurate but noticeably faster +#endif return {new_mean, curr_sum.sigma2 + delta * (val - new_mean), new_count}; } else{ return {0.f, curr_sum.sigma2 + val * val, 0}; @@ -159,7 +163,11 @@ WelfordDataLN cuWelfordCombine( U count = dataA.count + dataB.count; U mean, sigma2; if (count > decltype(dataB.count){0}) { +#if defined(USE_ROCM) && defined(PYTORCH_LAYERNORM_FAST_RECIPROCAL) + auto coef = __builtin_amdgcn_rcpf(count); +#else auto coef = 1.f/count; //NB we don't use --use_fast_math, but this is emulation, 1./count goes to intrinsic, `* coef` is multiplication, instead of slow fp division +#endif auto nA = dataA.count * coef; auto nB = dataB.count * coef; mean = nA*dataA.mean + nB*dataB.mean; diff --git a/aten/src/ATen/native/cudnn/MHA.cpp b/aten/src/ATen/native/cudnn/MHA.cpp index 182716ed7a1a..1658ce34ca6c 100644 --- a/aten/src/ATen/native/cudnn/MHA.cpp +++ b/aten/src/ATen/native/cudnn/MHA.cpp @@ -146,7 +146,7 @@ namespace native { namespace fe = cudnn_frontend; -#define MAX_MHA_DIM 4 +constexpr uint8_t MAX_MHA_DIM = 4; // Whether we will use ragged offsets in the dense (non-nested) path // to avoid recompilation @@ -238,7 +238,8 @@ void setMHAParams( const std::optional& attn_bias, double dropout_probability, bool is_causal, - bool return_softmaxstats) { + bool return_softmaxstats, + bool is_nested) { memset(¶ms, 0, sizeof(MHAParams)); params.device_id = at::cuda::current_device(); params.dataType = fe::DataType_t::HALF; @@ -255,23 +256,24 @@ void setMHAParams( params.is_causal = is_causal; params.return_softmaxstats = return_softmaxstats; params.has_attn_bias = attn_bias.has_value(); + // Expect 4D dense tensor, 3D nested case (THD) TORCH_INTERNAL_ASSERT( - q.sizes().size() == MAX_MHA_DIM, + q.sizes().size() == (uint8_t)(MAX_MHA_DIM - (uint8_t)is_nested), "Q tensor has unexpected number of dims, please report a bug to PyTorch."); TORCH_INTERNAL_ASSERT( - q.strides().size() == MAX_MHA_DIM, + q.strides().size() == (uint8_t)(MAX_MHA_DIM - (uint8_t)is_nested), "Q tensor has unexpected number of dims, please report a bug to PyTorch."); TORCH_INTERNAL_ASSERT( - k.sizes().size() == MAX_MHA_DIM, + k.sizes().size() == (uint8_t)(MAX_MHA_DIM - (uint8_t)is_nested), "K tensor has unexpected number of dims, please report a bug to PyTorch."); TORCH_INTERNAL_ASSERT( - k.strides().size() == MAX_MHA_DIM, + k.strides().size() == (uint8_t)(MAX_MHA_DIM - (uint8_t)is_nested), "K tensor has unexpected number of dims, please report a bug to PyTorch."); TORCH_INTERNAL_ASSERT( - v.sizes().size() == MAX_MHA_DIM, + v.sizes().size() == (uint8_t)(MAX_MHA_DIM - (uint8_t)is_nested), "V tensor has unexpected number of dims, please report a bug to PyTorch."); TORCH_INTERNAL_ASSERT( - v.strides().size() == MAX_MHA_DIM, + v.strides().size() == (uint8_t)(MAX_MHA_DIM - (uint8_t)is_nested), "V tensor has unexpected number of dims, please report a bug to PyTorch."); std::copy(q.sizes().begin(), 
q.sizes().end(), params.q_dim.begin()); std::copy(q.strides().begin(), q.strides().end(), params.q_stride.begin()); @@ -320,7 +322,8 @@ struct MHACacheKeyWrapper : ParamsWrapper { const std::optional& attn_bias, double dropout_probability, bool is_causal, - bool return_softmaxstats) { + bool return_softmaxstats, + bool is_nested) { setMHAParams( this->pod, b, @@ -335,7 +338,8 @@ struct MHACacheKeyWrapper : ParamsWrapper { attn_bias, dropout_probability, is_causal, - return_softmaxstats); + return_softmaxstats, + is_nested); } }; @@ -479,6 +483,8 @@ auto build_graph( fe::graph::SDPA_attributes() .set_name("CUDNN_SDPA") .set_is_inference(return_softmaxstats == false) + // TODO(eqy): switch to this API once cuDNN FE is upgraded + // .set_generate_stats(return_softmaxstats) .set_causal_mask(is_causal) .set_attn_scale(attn_scale); if (use_ragged_in_dense(q, k, v, o, attn_bias.has_value())) { @@ -699,6 +705,8 @@ auto build_graph_nestedtensor( fe::graph::SDPA_attributes() .set_name("CUDNN_SDPA_NESTEDTENSOR") .set_is_inference(return_softmaxstats == false) + // TODO(eqy): switch to this API once cuDNN FE is upgraded + // .set_generate_stats(return_softmaxstats) .set_causal_mask(is_causal) .set_attn_scale(attn_scale) .set_seq_len_q(SEQ_LEN_Q_) @@ -1386,7 +1394,8 @@ void run_cudnn_SDP_fprop( attn_bias, dropout_probability, is_causal, - return_softmaxstats); + return_softmaxstats, + false); auto graph_ptr = getMHAGraphCache_().find(key); std::shared_ptr mha_graph; if (graph_ptr) { @@ -1484,30 +1493,53 @@ void run_cudnn_SDP_fprop_nestedtensor( if (return_softmaxstats && !softmaxstats.defined()) { softmaxstats = at::empty({q.size(0), h_q, 1}, q.options().dtype(kFloat)); } - auto mha_graph = build_graph_nestedtensor( + + auto key = MHACacheKeyWrapper( b, h_q, - h_k, - h_v, - s_q, - s_kv, + s_q, // max-seqlen-q + s_kv, // max-seqlen-kv d_qk, d_v, - scaling_factor, - return_softmaxstats, - is_causal, - dropout_probability, - cum_seqlen_q, - cum_seqlen_kv, q, k, v, attn_bias, - softmaxstats, - o, - dropoutseed, - dropoutoffset, - handle); + dropout_probability, + is_causal, + return_softmaxstats, + true); + auto graph_ptr = getMHAGraphCache_().find(key); + std::shared_ptr mha_graph; + + if (graph_ptr) { + mha_graph = *graph_ptr; + } else { + mha_graph = build_graph_nestedtensor( + b, + h_q, + h_k, + h_v, + s_q, + s_kv, + d_qk, + d_v, + scaling_factor, + return_softmaxstats, + is_causal, + dropout_probability, + cum_seqlen_q, + cum_seqlen_kv, + q, + k, + v, + attn_bias, + softmaxstats, + o, + dropoutseed, + dropoutoffset, + handle); + } auto seqlen_q = at::diff(cum_seqlen_q, 1, 0); auto seqlen_kv = at::diff(cum_seqlen_kv, 1, 0); auto rag_q_off = cum_seqlen_q.mul(h_q * d_qk); @@ -1636,7 +1668,8 @@ void run_cudnn_SDP_bprop( attn_bias, dropout_probability, is_causal, - true); + true, + false); auto graph_backward_ptr = getMHAGraphBackwardCache_().find(key); std::shared_ptr mha_graph; if (graph_backward_ptr) { @@ -1761,33 +1794,55 @@ void run_cudnn_SDP_bprop_nestedtensor( cudnnHandle_t handle = getCudnnHandle(); - auto mha_graph = build_graph_backward_nestedtensor( + auto key = MHACacheKeyWrapper( b, h_q, - h_k, - h_v, - s_q, - s_kv, + s_q, // max-seqlen-q + s_kv, // max-seqlen-kv d_qk, d_v, - scaling_factor, - is_causal, - dropout_probability, - cum_seqlen_q, - cum_seqlen_kv, q, k, v, attn_bias, - o, - dO_, - softmaxstats, - dQ, - dK, - dV, - dropoutseed, - dropoutoffset, - handle); + dropout_probability, + is_causal, + true, + true); + auto graph_ptr = getMHAGraphCache_().find(key); + std::shared_ptr 
mha_graph; + + if (graph_ptr) { + mha_graph = *graph_ptr; + } else { + mha_graph = build_graph_backward_nestedtensor( + b, + h_q, + h_k, + h_v, + s_q, + s_kv, + d_qk, + d_v, + scaling_factor, + is_causal, + dropout_probability, + cum_seqlen_q, + cum_seqlen_kv, + q, + k, + v, + attn_bias, + o, + dO_, + softmaxstats, + dQ, + dK, + dV, + dropoutseed, + dropoutoffset, + handle); + } std::unordered_map variant_pack = { // inputs diff --git a/aten/src/ATen/native/miopen/Conv_miopen.cpp b/aten/src/ATen/native/miopen/Conv_miopen.cpp index 154118d9f272..41226680c4b5 100644 --- a/aten/src/ATen/native/miopen/Conv_miopen.cpp +++ b/aten/src/ATen/native/miopen/Conv_miopen.cpp @@ -18,6 +18,7 @@ #include #include #include +#include #endif // TODO: Remove the condition on AT_ROCM_ENABLED entirely, @@ -145,13 +146,13 @@ at::Tensor miopen_convolution_relu( #include #include +#include #include #include #include #include -#include #include #include #include @@ -162,10 +163,13 @@ at::Tensor miopen_convolution_relu( namespace at { namespace native { -Tensor narrowGroup(const Tensor& t, int dim, int group_idx, int64_t groups) { - auto group_size = t.size(dim) / groups; - return t.narrow(dim, group_idx * group_size, group_size); -} +// See NOTE [ Convolution design ] in aten/src/ATen/native/cudnn/ConvShared.cpp + +// --------------------------------------------------------------------- +// +// Helper classes +// +// --------------------------------------------------------------------- // This POD struct is used to let us easily compute hashes of the // parameters @@ -174,6 +178,8 @@ struct ConvolutionParams miopenHandle_t handle; miopenDataType_t dataType; int input_size[2 + max_dim]; + uint8_t input_dim; + at::MemoryFormat memory_format; int input_stride[2 + max_dim]; int weight_size[2 + max_dim]; int padding[max_dim]; @@ -181,25 +187,29 @@ struct ConvolutionParams int dilation[max_dim]; int64_t groups; bool deterministic; - int device_id; //This is needed to distinguish between miopen handles of multiple gpus. + c10::DeviceIndex device_id; //This is needed to distinguish between miopen handles of multiple gpus. 
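// [Editorial sketch] ConvolutionParams is kept standard-layout on purpose: the
// benchmark cache hashes and compares it as raw bytes (see the ParamsHash /
// ParamsEqual functors removed below, replaced here by the shared ATen helpers),
// which is also why setConvolutionParams memsets the struct to zero first so that
// padding bytes are deterministic. A minimal illustration of that byte-wise hash
// over a trivially copyable key (hash constants match the removed functor; the
// names Key and hash_key_bytes are ours):
#include <cstddef>
#include <cstdint>
#include <type_traits>
template <typename Key>
std::size_t hash_key_bytes(const Key& key) {
  static_assert(std::is_trivially_copyable_v<Key>, "key must be hashable as raw bytes");
  const auto* p = reinterpret_cast<const uint8_t*>(&key);
  uint32_t value = 0x811C9DC5;   // FNV offset basis
  for (std::size_t i = 0; i < sizeof(Key); ++i) {
    value ^= p[i];
    value *= 0x01000193;         // FNV prime
  }
  return static_cast<std::size_t>(value);
}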
// NB: transposed purposely omitted: transposed just swaps // forward and backward, so you can reuse the benchmark entry, }; -// ConvolutionParams must be a POD because we read out its memory -// contenst as char* when hashing -static_assert(std::is_standard_layout_v, "ConvolutionParams not POD"); void setConvolutionParams( - ConvolutionParams* params, miopenHandle_t handle, - const at::Tensor& input, const at::Tensor& weight, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, - int64_t groups, bool deterministic) { - + ConvolutionParams* params, + miopenHandle_t handle, + const at::Tensor& input, + const at::Tensor& weight, + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool deterministic, + at::MemoryFormat memory_format) { miopenDataType_t dataType = getMiopenDataType(input); memset(params, 0, sizeof(ConvolutionParams)); params->dataType = dataType; params->handle = handle; // ASSERT(weight.dim() == input.dim()) + params->input_dim = input.dim(); + params->memory_format = memory_format; for (int i = 0; i != input.dim(); ++i) { params->input_size[i] = (int) input.size(i); params->input_stride[i] = (int) input.stride(i); @@ -214,9 +224,7 @@ void setConvolutionParams( } params->groups = groups; params->deterministic = deterministic; - int device_id; - HIP_CHECK(hipGetDevice(&device_id)); - params->device_id = device_id; + params->device_id = at::cuda::current_device(); } // Convenience struct for passing around descriptors and data @@ -239,31 +247,10 @@ struct ConvolutionArgs { // // --------------------------------------------------------------------- -// Hashing machinery for ConvolutionParams -struct ParamsHash { - std::size_t operator()(const ConvolutionParams& params) const { - auto ptr = reinterpret_cast(¶ms); - uint32_t value = 0x811C9DC5; - for (const auto i : c10::irange((int)sizeof(ConvolutionParams))) { - value ^= ptr[i]; - value *= 0x01000193; - } - return (size_t)value; - } -}; - -struct ParamsEqual { - bool operator()(const ConvolutionParams& a, const ConvolutionParams& b) const { - auto ptr1 = reinterpret_cast(&a); - auto ptr2 = reinterpret_cast(&b); - return memcmp(ptr1, ptr2, sizeof(ConvolutionParams)) == 0; - } -}; - template struct BenchmarkCache { std::mutex mutex; - std::unordered_map map; + std::unordered_map, ParamsEqual> map; bool find(const ConvolutionParams& params, T* results) { std::lock_guard guard(mutex); @@ -314,39 +301,39 @@ size_t getWorkspaceSize( const ConvolutionArgs& args, const miopenConvFwdAlgorithm_t) { size_t sz = 0; - miopenConvolutionForwardGetWorkSpaceSize( + MIOPEN_CHECK(miopenConvolutionForwardGetWorkSpaceSize( args.handle, args.wdesc.desc(), args.idesc.desc(), args.cdesc.desc(), args.odesc.desc(), - &sz); + &sz)); return sz; } size_t getWorkspaceSize( const ConvolutionArgs& args, const miopenConvBwdDataAlgorithm_t) { size_t sz = 0; - miopenConvolutionBackwardDataGetWorkSpaceSize( + MIOPEN_CHECK(miopenConvolutionBackwardDataGetWorkSpaceSize( args.handle, args.odesc.desc(), args.wdesc.desc(), args.cdesc.desc(), args.idesc.desc(), - &sz); + &sz)); return sz; } size_t getWorkspaceSize( const ConvolutionArgs& args, const miopenConvBwdWeightsAlgorithm_t) { size_t sz = 0; - miopenConvolutionBackwardWeightsGetWorkSpaceSize( + MIOPEN_CHECK(miopenConvolutionBackwardWeightsGetWorkSpaceSize( args.handle, args.odesc.desc(), args.idesc.desc(), args.cdesc.desc(), args.wdesc.desc(), - &sz); + &sz)); return sz; } @@ -649,6 +636,94 @@ Workspace chooseSolution(const ConvolutionArgs& args, uint64_t* 
solution_id) } } +// See NOTE [ raw_cudnn_convolution_forward_out ] in aten/src/ATen/native/cudnn/Conv_v7.cpp + +// --------------------------------------------------------------------- +// +// Splitting to 32bit +// +// --------------------------------------------------------------------- + +template +static inline void split_batch_dim_to_32bit_out( + const at::Tensor& output, + const at::Tensor& input, + const at::Tensor& weight, + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic, + bool depthwise, + int64_t max_worksize, + func_t func_32bit) { + constexpr int64_t int_max = std::numeric_limits::max(); + const int64_t ni = input.numel(); + const int64_t no = output.numel(); + // Assume the shape of the tensor is (N, C, D1, D2, ...) + // if N * C * D1 * D2 * ... <= int_max, then no need to split at all + if (ni <= int_max && no <= int_max) { + func_32bit( + output, + input, + weight, + padding, + stride, + dilation, + groups, + benchmark, + deterministic, + depthwise); + return; + } + // else, if C * D1 * D2 * ... <= int_max, then we just need to split across + // the N dimension + // + // Here we use a simple heuristics to determine the size of each split + // We don't max out the 2^31 address space because this number is super + // large and very likely to get an OOM. + int64_t n = output.size(0); + int64_t max_inner_size = std::max(ni, no) / n; + int64_t split_size = std::max(max_worksize / max_inner_size, 1L); + int64_t num_splits = (n + split_size - 1) / split_size; + if (split_size * max_inner_size < int_max) { + for (const auto i : c10::irange(num_splits)) { + int64_t start = split_size * i; + int64_t split_size_ = std::min(split_size, n - start); + Tensor input_ = input.narrow(0, start, split_size_); + Tensor output_ = output.narrow(0, start, split_size_); + func_32bit( + output_, + input_, + weight, + padding, + stride, + dilation, + groups, + benchmark, + deterministic, + depthwise); + } + return; + } + // If control flow reaches here, this means even splitting N is not enough, + // then things starts to become complicated: For example, for conv2d, there + // following questions needs to be considered. + // - Is the memory layout NCHW or NHWC ? + // - If the conv is NCHW -> NC'H'W', then should we + // - split only NC? + // - split only N'C'? + // - split both? + // - If the conv is NHWC, then we need to split across H, we need to be very + // careful about the boundary condition + // to make sure that the boundary is handled correctly. + // - If we decide to make these splits, is the memory contiguous? Do we need + // to copy the memory? 
Considering the complexity of this issue, it is better + // not to use cuDNN for this case + TORCH_INTERNAL_ASSERT(false, "This case should not be dispatched to cuDNN."); +} + // --------------------------------------------------------------------- // // Bias addition @@ -690,8 +765,47 @@ void miopen_convolution_add_bias_(CheckedFrom c, const TensorArg& output, const */ } -// see NOTE [ Convolution design ] in src/Aten/native/cudnn/Conv.cpp +Tensor miopen_convolution_backward_bias(const Tensor& grad_output_t) +{ + TensorArg grad_output{ grad_output_t, "grad_output", 1 }; + + // TODO: Workaround since MIOpen does not support NHWC bias + // See #64426 + std::vector discard_dims; + for( int i = 0; i < grad_output_t.dim(); i++ ) { + if(i != output_channels_dim ) { + discard_dims.push_back(i); + } + } + + Tensor outputBias = at::squeeze( at::sum(grad_output_t, discard_dims, true) ); + if( outputBias.dim() == 0 ) { + // always return a tensor of shape [_] + return outputBias.unsqueeze(0); + } + else { + return outputBias; + } + +/* MIOpen does not support NHWC bias. Activate once support is added. + auto grad_bias_t = at::empty( { grad_output->size(output_channels_dim) }, grad_output->options()); + + TensorArg grad_bias{ grad_bias_t, "result", 0 }; + + TensorDescriptor bdesc{grad_bias->expand({1, grad_bias->size(0)}), + static_cast(grad_output->dim())}; + TensorDescriptor odesc{*grad_output}; + + auto handle = getMiopenHandle(); + auto dataType = getMiopenDataType(*grad_bias); + Constant one(dataType, 1); + Constant zero(dataType, 0); + MIOPEN_CHECK(miopenConvolutionBackwardBias(handle, &one, odesc.desc(), grad_output->data_ptr(), + &zero, bdesc.desc(), grad_bias->data_ptr())); + return *grad_bias; +*/ +} // --------------------------------------------------------------------- // @@ -699,30 +813,47 @@ void miopen_convolution_add_bias_(CheckedFrom c, const TensorArg& output, const // // --------------------------------------------------------------------- -// The raw API directly invokes MIOpen. -// -// There are a few reasons this should never be directly exposed -// via ATen: -// -// - It takes output as a parameter (this should be computed!) -// - It doesn't do input checking -// - It doesn't resize output (it is assumed to be correctly sized) -// -void raw_miopen_convolution_forward_out( - const Tensor& output, const Tensor& input, const Tensor& weight, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic) { - +void raw_miopen_convolution_forward_out_32bit( + const Tensor& output, + const Tensor& input, + const Tensor& weight, + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic, + bool depthwise=false) { auto dataType = getMiopenDataType(input); - miopenConvolutionMode_t c_mode = miopenConvolution; + miopenConvolutionMode_t c_mode = depthwise ? 
miopenDepthwise : miopenConvolution; - ConvolutionArgs args{ input, output, weight }; + ConvolutionArgs args{input, output, weight}; args.handle = getMiopenHandle(); - setConvolutionParams(&args.params, args.handle, input, weight, padding, stride, dilation, groups, deterministic); - args.idesc.set(input); - args.wdesc.set(weight, input.suggest_memory_format(), 0); - args.odesc.set(output); - args.cdesc.set(dataType, c_mode, input.dim() - 2, args.params.padding, args.params.stride, args.params.dilation, args.params.groups, benchmark, deterministic); + at::MemoryFormat memory_format = miopen_conv_suggest_memory_format(input, weight); + setConvolutionParams( + &args.params, + args.handle, + input, + weight, + padding, + stride, + dilation, + groups, + deterministic, + memory_format); + args.idesc.set(input, memory_format); + args.wdesc.set(weight, memory_format, 0); + args.odesc.set(output, memory_format); + args.cdesc.set( + dataType, + c_mode, + input.dim() - 2, + args.params.padding, + args.params.stride, + args.params.dilation, + args.params.groups, + benchmark, + deterministic); if (at::globalContext().immediateMiopen()) { uint64_t solution_id; @@ -730,10 +861,16 @@ void raw_miopen_convolution_forward_out( MIOPEN_CHECK(miopenConvolutionForwardImmediate( args.handle, - args.wdesc.desc(), weight.const_data_ptr(), - args.idesc.desc(), input.const_data_ptr(), + args.wdesc.desc(), + weight.const_data_ptr(), + args.idesc.desc(), + input.const_data_ptr(), args.cdesc.desc(), - args.odesc.desc(), output.data_ptr(), workspace.data, workspace.size, solution_id)); + args.odesc.desc(), + output.data_ptr(), + workspace.data, + workspace.size, + solution_id)); } else { miopenConvFwdAlgorithm_t fwdAlg; @@ -744,472 +881,216 @@ void raw_miopen_convolution_forward_out( MIOPEN_CHECK(miopenConvolutionForward( args.handle, - &one, args.idesc.desc(), input.const_data_ptr(), - args.wdesc.desc(), weight.const_data_ptr(), - args.cdesc.desc(), fwdAlg, &zero, - args.odesc.desc(), output.data_ptr(), workspace.data, workspace.size)); + &one, + args.idesc.desc(), + input.const_data_ptr(), + args.wdesc.desc(), + weight.const_data_ptr(), + args.cdesc.desc(), + fwdAlg, + &zero, + args.odesc.desc(), + output.data_ptr(), + workspace.data, + workspace.size)); } } -Tensor miopen_convolution_forward( +void raw_miopen_convolution_forward_out( + const Tensor& output, + const Tensor& input, + const Tensor& weight, + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic, + bool depthwise=false) { + split_batch_dim_to_32bit_out( + output, + input, + weight, + padding, + stride, + dilation, + groups, + benchmark, + deterministic, + depthwise, + 1024 * 1024 * 256, + raw_miopen_convolution_forward_out_32bit); +} + +void miopen_convolution_forward_out( + TensorArg& output, CheckedFrom c, - const TensorArg& input, const TensorArg& weight, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic) -{ + const TensorArg& input, + const TensorArg& weight, + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic, + bool depthwise=false) { checkAllSameType(c, {input, weight}); checkAllSameGPU(c, {input, weight}); - auto memory_format = at::MemoryFormat::Contiguous; - if (miopen_conv_use_channels_last(*input, *weight)) { - memory_format = (weight->ndimension() == 5) ? 
at::MemoryFormat::ChannelsLast3d : at::MemoryFormat::ChannelsLast; - } - - Tensor output_t = at::detail::empty_cuda( - conv_output_size(input->sizes(), weight->sizes(), - padding, stride, dilation), - input->options().memory_format(memory_format)); - - if (output_t.numel() == 0) { - return output_t; - } - - // Avoid ambiguity of "output" when this is being used as backwards - TensorArg output{ output_t, "result", 0 }; - convolution_shape_check(c, input, weight, output, padding, stride, dilation, groups); + auto memory_format = output->suggest_memory_format(); + convolution_shape_check( + c, input, weight, output, padding, stride, dilation, groups); - // See #4500 Tensor weight_contig = weight->contiguous(memory_format); - // Make sure that NC11 strides follow formula - weight_contig.resize_(weight_contig.sizes(), memory_format); Tensor input_contig = input->contiguous(memory_format); - input_contig.resize_(input_contig.sizes(), memory_format); - - raw_miopen_convolution_forward_out( - *output, input_contig, weight_contig, - padding, stride, dilation, groups, benchmark, deterministic); - - return *output; + *output, + input_contig, + weight_contig, + padding, + stride, + dilation, + groups, + benchmark, + deterministic, + depthwise); } Tensor miopen_convolution( - const Tensor& input_t, const Tensor& weight_t, const std::optional& bias_t_opt, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, - int64_t groups, bool benchmark, bool deterministic) -{ + const Tensor& input_t, + const Tensor& weight_t, + const std::optional& bias_t_opt, + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned bias_t_maybe_owned = at::borrow_from_optional_tensor(bias_t_opt); const Tensor& bias_t = *bias_t_maybe_owned; - TensorArg input { input_t, "input", 1 }, - weight { weight_t, "weight", 2 }, - bias { bias_t, "bias", 3 }; + TensorArg input{input_t, "input", 1 }, weight{weight_t, "weight", 2}, bias{bias_t, "bias", 3}; CheckedFrom c = "miopen_convolution"; - auto output_t = miopen_convolution_forward( - c, input, weight, padding, stride, dilation, groups, benchmark, deterministic); + auto memory_format = miopen_conv_suggest_memory_format(input_t, weight_t); + Tensor output_t = at::detail::empty_cuda( + conv_output_size( + input_t.sizes(), weight_t.sizes(), padding, stride, dilation), + input->options().memory_format(memory_format)); + if (output_t.numel() == 0) { + return output_t; + } + // Avoid ambiguity of "output" when this is being used as backwards + TensorArg output{output_t, "result", 0}; + miopen_convolution_forward_out( + output, + c, + input, + weight, + padding, + stride, + dilation, + groups, + benchmark, + deterministic); if (bias->defined()) { - miopen_convolution_add_bias_(c, { output_t, "result", 0 }, bias); + miopen_convolution_add_bias_(c, output, bias); } - return output_t; + return *output; } -//Depthwise Convolutions -void raw_miopen_depthwise_convolution_forward_out( - const Tensor& output, const Tensor& input, const Tensor& weight, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic) { +Tensor miopen_convolution_transpose_backward_input( + const Tensor& grad_output_t, + const Tensor& weight_t, + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic) { + TensorArg grad_output{ grad_output_t, 
"grad_output", 1 }, weight{weight_t, "weight", 2}; + auto memory_format = + miopen_conv_suggest_memory_format(grad_output_t, weight_t); + Tensor output_t = at::detail::empty_cuda( + conv_output_size( + grad_output_t.sizes(), weight_t.sizes(), padding, stride, dilation), + grad_output_t.options().memory_format(memory_format)); - auto dataType = getMiopenDataType(input); - miopenConvolutionMode_t c_mode = miopenDepthwise; + if (output_t.numel() == 0) { + return output_t; + } + TensorArg output{output_t, "result", 0}; + miopen_convolution_forward_out( + output, + "miopen_convolution_transpose_backward_input", + grad_output, + weight, + padding, + stride, + dilation, + groups, + benchmark, + deterministic); + return *output; +} - ConvolutionArgs args{ input, output, weight }; - args.handle = getMiopenHandle(); - setConvolutionParams(&args.params, args.handle, input, weight, padding, stride, dilation, groups, deterministic); - args.idesc.set(input); - args.wdesc.set(weight, input.suggest_memory_format(), 0); - args.odesc.set(output); - args.cdesc.set(dataType, c_mode, input.dim() - 2, args.params.padding, args.params.stride, args.params.dilation, args.params.groups, benchmark, deterministic); +// file organization would put miopen_convolution_transpose_backward_weight here, +// but it depends on miopen_convolution_backward_weight which is defined later +Tensor miopen_convolution_transpose_backward_weight( + IntArrayRef weight_size, + const Tensor& grad_output_t, + const Tensor& input_t, + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic); - if (at::globalContext().immediateMiopen()) { - uint64_t solution_id; - Workspace workspace = chooseSolution(args, &solution_id); +std::tuple miopen_convolution_transpose_backward( + const at::Tensor& input, + const at::Tensor& grad_output_t, + const at::Tensor& weight, + IntArrayRef padding, + IntArrayRef output_padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic, + std::array output_mask) { + Tensor grad_output = grad_output_t.contiguous(input.suggest_memory_format()); - MIOPEN_CHECK(miopenConvolutionForwardImmediate( - args.handle, - args.wdesc.desc(), weight.const_data_ptr(), - args.idesc.desc(), input.const_data_ptr(), - args.cdesc.desc(), - args.odesc.desc(), output.data_ptr(), workspace.data, workspace.size, solution_id)); + Tensor grad_input, grad_weight, grad_bias; + if (output_mask[0]) { + grad_input = miopen_convolution_transpose_backward_input( + grad_output, + weight, + padding, + stride, + dilation, + groups, + benchmark, + deterministic); } - else { - miopenConvFwdAlgorithm_t fwdAlg; - Workspace workspace = chooseAlgorithm(args, benchmark, &fwdAlg); - - Constant one(dataType, 1); - Constant zero(dataType, 0); - - MIOPEN_CHECK(miopenConvolutionForward( - args.handle, - &one, args.idesc.desc(), input.const_data_ptr(), - args.wdesc.desc(), weight.const_data_ptr(), - args.cdesc.desc(), fwdAlg, &zero, - args.odesc.desc(), output.data_ptr(), workspace.data, workspace.size)); + if (output_mask[1]) { + grad_weight = miopen_convolution_transpose_backward_weight( + weight.sizes(), + grad_output, + input, + padding, + stride, + dilation, + groups, + benchmark, + deterministic); + } + if (output_mask[2]) { + grad_bias = miopen_convolution_backward_bias(grad_output); } -} - -Tensor miopen_depthwise_convolution_forward( - CheckedFrom c, - const TensorArg& input, const TensorArg& weight, - IntArrayRef padding, IntArrayRef 
stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic) -{ - checkAllSameType(c, {input, weight}); - checkAllSameGPU(c, {input, weight}); - - auto memory_format = at::MemoryFormat::Contiguous; - if (miopen_conv_use_channels_last(*input, *weight)) { - memory_format = (weight->ndimension() == 5) ? at::MemoryFormat::ChannelsLast3d : at::MemoryFormat::ChannelsLast; - } - - Tensor output_t = at::detail::empty_cuda( - conv_output_size(input->sizes(), weight->sizes(), - padding, stride, dilation), - input->options().memory_format(memory_format)); - - TensorArg output{ output_t, "result", 0 }; - convolution_shape_check(c, input, weight, output, padding, stride, dilation, groups); - - // See #4500 - Tensor weight_contig = weight->contiguous(memory_format); - // Make sure that NC11 strides follow formula - weight_contig.resize_(weight_contig.sizes(), memory_format); - Tensor input_contig = input->contiguous(memory_format); - input_contig.resize_(input_contig.sizes(), memory_format); - - raw_miopen_depthwise_convolution_forward_out( - *output, input_contig, weight_contig, - padding, stride, dilation, groups, benchmark, deterministic); - - return *output; -} - -Tensor miopen_depthwise_convolution( - const Tensor& input_t, const Tensor& weight_t, const std::optional& bias_t_opt, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, - int64_t groups, bool benchmark, bool deterministic) -{ - // See [Note: hacky wrapper removal for optional tensor] - c10::MaybeOwned bias_t_maybe_owned = at::borrow_from_optional_tensor(bias_t_opt); - const Tensor& bias_t = *bias_t_maybe_owned; - - TensorArg input { input_t, "input", 1 }, - weight { weight_t, "weight", 2 }, - bias { bias_t, "bias", 3 }; - CheckedFrom c = "miopen_depthwise_convolution"; - auto output_t = miopen_depthwise_convolution_forward( - c, input, weight, padding, stride, dilation, groups, benchmark, deterministic); - if (bias->defined()) { - miopen_convolution_add_bias_(c, { output_t, "result", 0 }, bias); - } - return output_t; -} - -// --------------------------------------------------------------------- -// -// Convolution backward (bias) -// -// --------------------------------------------------------------------- - -Tensor miopen_convolution_backward_bias( - const Tensor& grad_output_t) -{ - TensorArg grad_output{ grad_output_t, "grad_output", 1 }; - - // TODO: Workaround since MIOpen does not support NHWC bias - // See #64426 - std::vector discard_dims; - for( int i = 0; i < grad_output_t.dim(); i++ ) { - if(i != output_channels_dim ) { - discard_dims.push_back(i); - } - } - - Tensor outputBias = at::squeeze( at::sum(grad_output_t, discard_dims, true) ); - if( outputBias.dim() == 0 ) { - // always return a tensor of shape [_] - return outputBias.unsqueeze(0); - } - else { - return outputBias; - } - -/* MIOpen does not support NHWC bias. Activate once support is added. 
- auto grad_bias_t = at::empty( { grad_output->size(output_channels_dim) }, grad_output->options()); - - TensorArg grad_bias{ grad_bias_t, "result", 0 }; - - TensorDescriptor bdesc{grad_bias->expand({1, grad_bias->size(0)}), - static_cast(grad_output->dim())}; - TensorDescriptor odesc{*grad_output}; - - auto handle = getMiopenHandle(); - auto dataType = getMiopenDataType(*grad_bias); - Constant one(dataType, 1); - Constant zero(dataType, 0); - - MIOPEN_CHECK(miopenConvolutionBackwardBias(handle, &one, odesc.desc(), grad_output->data_ptr(), - &zero, bdesc.desc(), grad_bias->data_ptr())); - return *grad_bias; -*/ -} - -// --------------------------------------------------------------------- -// -// Convolution backward (weight) -// -// --------------------------------------------------------------------- - -void raw_miopen_convolution_backward_weight_out( - const Tensor& grad_weight, const Tensor& grad_output, const Tensor& input, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic) { - - auto dataType = getMiopenDataType(input); - miopenConvolutionMode_t c_mode = miopenConvolution; - - ConvolutionArgs args{ input, grad_output, grad_weight }; - args.handle = getMiopenHandle(); - setConvolutionParams(&args.params, args.handle, input, grad_weight, padding, stride, dilation, groups, deterministic); - args.idesc.set(input); - args.wdesc.set(grad_weight, input.suggest_memory_format(), 0); - args.odesc.set(grad_output); - args.cdesc.set(dataType, c_mode, input.dim() - 2, args.params.padding, args.params.stride, args.params.dilation, args.params.groups, benchmark, deterministic); - - if (at::globalContext().immediateMiopen()) { - uint64_t solution_id; - Workspace workspace = chooseSolution(args, &solution_id); - - MIOPEN_CHECK(miopenConvolutionBackwardWeightsImmediate( - args.handle, - args.odesc.desc(), grad_output.const_data_ptr(), - args.idesc.desc(), input.const_data_ptr(), - args.cdesc.desc(), - args.wdesc.desc(), grad_weight.data_ptr(), workspace.data, workspace.size, solution_id)); - } - else { - miopenConvBwdWeightsAlgorithm_t bwdFilterAlg; - Workspace workspace = chooseAlgorithm(args, benchmark, &bwdFilterAlg); - - Constant one(dataType, 1); - Constant zero(dataType, 0); - - MIOPEN_CHECK(miopenConvolutionBackwardWeights( - args.handle, - &one, args.odesc.desc(), grad_output.const_data_ptr(), - args.idesc.desc(), input.const_data_ptr(), - args.cdesc.desc(), bwdFilterAlg, &zero, - args.wdesc.desc(), grad_weight.data_ptr(), workspace.data, workspace.size)); - } -} - -//Depthwise backward weights. 
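// [Editorial note] The depthwise twins removed below duplicate the regular backward
// paths except for the MIOpen convolution mode; the replacement code threads a
// single `depthwise` flag down to the mode choice instead. Minimal sketch of that
// pattern (the wrapper name select_conv_mode is ours; it relies on the MIOpen types
// already included by this file):
inline miopenConvolutionMode_t select_conv_mode(bool depthwise) {
  return depthwise ? miopenDepthwise : miopenConvolution;
}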
-void raw_miopen_depthwise_convolution_backward_weight_out( - const Tensor& grad_weight, const Tensor& grad_output, const Tensor& input, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic) { - - auto dataType = getMiopenDataType(input); - miopenConvolutionMode_t c_mode = miopenDepthwise; - - ConvolutionArgs args{ input, grad_output, grad_weight }; - args.handle = getMiopenHandle(); - setConvolutionParams(&args.params, args.handle, input, grad_weight, padding, stride, dilation, groups, deterministic); - args.idesc.set(input); - args.wdesc.set(grad_weight, input.suggest_memory_format(), 0); - args.odesc.set(grad_output); - args.cdesc.set(dataType, c_mode, input.dim() - 2, args.params.padding, args.params.stride, args.params.dilation, args.params.groups, benchmark, deterministic); - - if (at::globalContext().immediateMiopen()) { - uint64_t solution_id; - Workspace workspace = chooseSolution(args, &solution_id); - - MIOPEN_CHECK(miopenConvolutionBackwardWeightsImmediate( - args.handle, - args.odesc.desc(), grad_output.const_data_ptr(), - args.idesc.desc(), input.const_data_ptr(), - args.cdesc.desc(), - args.wdesc.desc(), grad_weight.data_ptr(), workspace.data, workspace.size, solution_id)); - } - else { - miopenConvBwdWeightsAlgorithm_t bwdFilterAlg; - Workspace workspace = chooseAlgorithm(args, benchmark, &bwdFilterAlg); - - Constant one(dataType, 1); - Constant zero(dataType, 0); - - MIOPEN_CHECK(miopenConvolutionBackwardWeights( - args.handle, - &one, args.odesc.desc(), grad_output.const_data_ptr(), - args.idesc.desc(), input.const_data_ptr(), - args.cdesc.desc(), bwdFilterAlg, &zero, - args.wdesc.desc(), grad_weight.data_ptr(), workspace.data, workspace.size)); - } -} - -Tensor miopen_depthwise_convolution_backward_weight( - CheckedFrom c, - IntArrayRef weight_size, const TensorArg& grad_output, const TensorArg& input, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic) -{ - - checkAllSameType(c, {grad_output, input}); - checkAllSameGPU(c, {grad_output, input}); - - auto memory_format = at::MemoryFormat::Contiguous; - if (miopen_conv_use_channels_last(*input, *grad_output)) { - memory_format = (input->ndimension() == 5) ? at::MemoryFormat::ChannelsLast3d : at::MemoryFormat::ChannelsLast; - } - - Tensor grad_output_contig_t = grad_output->contiguous(memory_format); - // Make sure that NC11 strides follow formula - grad_output_contig_t.resize_(grad_output_contig_t.sizes(), memory_format); - TensorArg grad_output_contig{ grad_output_contig_t, "grad_output", 1 }; - - Tensor input_contig_t = input->contiguous(memory_format); - input_contig_t.resize_(input_contig_t.sizes(), memory_format); - TensorArg input_contig{ input_contig_t, "input", 2}; - - auto grad_weight_t = at::empty(weight_size, grad_output_contig->options(), memory_format); - - // For uniformity with everything else, although it seems grad_weight - // would be unambiguous too. 
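// [Editorial note] The open-coded memory-format selection in these removed bodies
// (Contiguous unless channels-last is profitable, ChannelsLast3d for 5-d tensors)
// is what the new miopen_conv_suggest_memory_format(...) calls centralize; its
// exact implementation is not part of this hunk. A sketch of the same decision,
// with the predicate argument prefers_channels_last standing in for the real check:
#include <cstdint>
#include <c10/core/MemoryFormat.h>
static c10::MemoryFormat suggest_conv_memory_format_sketch(bool prefers_channels_last,
                                                           int64_t weight_ndim) {
  if (!prefers_channels_last) {
    return c10::MemoryFormat::Contiguous;
  }
  return weight_ndim == 5 ? c10::MemoryFormat::ChannelsLast3d
                          : c10::MemoryFormat::ChannelsLast;
}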
- TensorArg grad_weight{ grad_weight_t, "result", 0 }; - convolution_shape_check(c, input, grad_weight, grad_output_contig, padding, stride, dilation, groups); - - raw_miopen_depthwise_convolution_backward_weight_out( - *grad_weight, *grad_output_contig, *input_contig, - padding, stride, dilation, groups, benchmark, deterministic); - - return grad_weight_t; -} - -Tensor miopen_depthwise_convolution_backward_weight( - IntArrayRef weight_size, - const Tensor& grad_output_t, - const Tensor& input_t, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic) -{ - TensorArg grad_output{ grad_output_t, "grad_output", 1 }, - input{ input_t, "input", 2 }; - return miopen_depthwise_convolution_backward_weight( - "miopen_depthwise_convolution_backward_weight", - weight_size, grad_output, input, - padding, stride, dilation, groups, benchmark, deterministic); -} - -Tensor miopen_convolution_backward_weight( - CheckedFrom c, - IntArrayRef weight_size, const TensorArg& grad_output, const TensorArg& input, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic) -{ - - checkAllSameType(c, {grad_output, input}); - checkAllSameGPU(c, {grad_output, input}); - - auto memory_format = at::MemoryFormat::Contiguous; - if (miopen_conv_use_channels_last(*input, *grad_output)) { - memory_format = (input->ndimension() == 5) ? at::MemoryFormat::ChannelsLast3d : at::MemoryFormat::ChannelsLast; - } - - Tensor grad_output_contig_t = grad_output->contiguous(memory_format); - // Make sure that NC11 strides follow formula - grad_output_contig_t.resize_(grad_output_contig_t.sizes(), memory_format); - TensorArg grad_output_contig{ grad_output_contig_t, "grad_output", 1 }; - - Tensor input_contig_t = input->contiguous(memory_format); - input_contig_t.resize_(input_contig_t.sizes(), memory_format); - TensorArg input_contig{ input_contig_t, "input", 2}; - - auto grad_weight_t = at::empty(weight_size, grad_output_contig->options(), memory_format); - - // For uniformity with everything else, although it seems grad_weight - // would be unambiguous too. 
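// [Editorial note] The contiguous(memory_format) + resize_(sizes, memory_format)
// pair in these removed bodies canonicalizes the strides of size-1 dims, so an
// NC11 weight really carries channels-last strides. Worked example (our numbers):
//   shape (Cout, Cin, 1, 1) = (64, 32, 1, 1) in ChannelsLast
//   canonical strides: (H*W*Cin, 1, W*Cin, Cin) = (32, 1, 32, 32)
// contiguous() alone can leave the degenerate H/W strides in a different (still
// valid) form, so the explicit resize_ keeps downstream stride-based format
// detection seeing the expected values.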
- TensorArg grad_weight{ grad_weight_t, "result", 0 }; - convolution_shape_check(c, input, grad_weight, grad_output_contig, padding, stride, dilation, groups); - - raw_miopen_convolution_backward_weight_out( - *grad_weight, *grad_output_contig, *input_contig, - padding, stride, dilation, groups, benchmark, deterministic); - - return grad_weight_t; -} - -Tensor miopen_convolution_backward_weight( - IntArrayRef weight_size, - const Tensor& grad_output_t, - const Tensor& input_t, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic) -{ - TensorArg grad_output{ grad_output_t, "grad_output", 1 }, - input{ input_t, "input", 2 }; - return miopen_convolution_backward_weight( - "miopen_convolution_backward_weight", - weight_size, grad_output, input, - padding, stride, dilation, groups, benchmark, deterministic); -} - -Tensor miopen_convolution_transpose_backward_input( - const Tensor& grad_output_t, const Tensor& weight_t, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, - int64_t groups, bool benchmark, bool deterministic) -{ - TensorArg grad_output { grad_output_t, "grad_output", 1 }, - weight { weight_t, "weight", 2 }; - return miopen_convolution_forward( - "miopen_convolution_transpose_backward_input", - grad_output, weight, padding, stride, dilation, groups, benchmark, deterministic); -} - -Tensor miopen_convolution_transpose_backward_weight( - IntArrayRef weight_size, - const Tensor& grad_output_t, - const Tensor& input_t, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic) -{ - TensorArg grad_output{ grad_output_t, "grad_output", 1 }, - input{ input_t, "input", 2 }; - return miopen_convolution_backward_weight( - "miopen_convolution_backward_weight", - weight_size, input, grad_output, - padding, stride, dilation, groups, benchmark, deterministic); -} - -std::tuple miopen_convolution_transpose_backward( - const at::Tensor& input, const at::Tensor& grad_output_t, const at::Tensor& weight, - IntArrayRef padding, IntArrayRef output_padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic, std::array output_mask) { - - Tensor grad_output = grad_output_t.contiguous(input.suggest_memory_format()); - - Tensor grad_input, grad_weight, grad_bias; - if (output_mask[0]) { - grad_input = miopen_convolution_transpose_backward_input(grad_output, weight, padding, stride, dilation, groups, benchmark, deterministic); - } - if (output_mask[1]) { - grad_weight = miopen_convolution_transpose_backward_weight(weight.sizes(), grad_output, input, padding, stride, dilation, groups, benchmark, deterministic); - } - if (output_mask[2]) { - grad_bias = miopen_convolution_backward_bias(grad_output); - } - - return std::tuple{grad_input, grad_weight, grad_bias}; + + return std::tuple{grad_input, grad_weight, grad_bias}; } // --------------------------------------------------------------------- @@ -1218,23 +1099,50 @@ std::tuple miopen_convolution_transpose_backwa // // --------------------------------------------------------------------- -void raw_miopen_convolution_backward_input_out( +// See NOTE [ Backward vs transpose convolutions ] in aten/src/ATen/native/cudnn/ConvShared.cpp + +void raw_miopen_convolution_backward_input_out_32bit( const at::Tensor& grad_input, const at::Tensor& grad_output, const at::Tensor& weight, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool 
deterministic) { - + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic, + bool depthwise=false) { auto dataType = getMiopenDataType(grad_output); - miopenConvolutionMode_t c_mode = miopenConvolution; + miopenConvolutionMode_t c_mode = depthwise ? miopenDepthwise : miopenConvolution; - ConvolutionArgs args{ grad_input, grad_output, weight }; + ConvolutionArgs args{grad_input, grad_output, weight}; args.handle = getMiopenHandle(); - setConvolutionParams(&args.params, args.handle, grad_input, weight, padding, stride, dilation, groups, deterministic); - args.idesc.set(grad_input); - args.wdesc.set(weight, grad_output.suggest_memory_format(), 0); - args.odesc.set(grad_output); - args.cdesc.set(dataType, c_mode, grad_output.dim() - 2, args.params.padding, args.params.stride, args.params.dilation, args.params.groups, benchmark, deterministic); + at::MemoryFormat memory_format = + miopen_conv_suggest_memory_format(grad_input, weight); + setConvolutionParams( + &args.params, + args.handle, + grad_input, + weight, + padding, + stride, + dilation, + groups, + deterministic, + memory_format); + args.idesc.set(grad_input, memory_format); + args.wdesc.set(weight, memory_format, 0); + args.odesc.set(grad_output, memory_format); + args.cdesc.set( + dataType, + c_mode, + grad_output.dim() - 2, + args.params.padding, + args.params.stride, + args.params.dilation, + args.params.groups, + benchmark, + deterministic); if (at::globalContext().immediateMiopen()) { uint64_t solution_id; @@ -1245,7 +1153,10 @@ void raw_miopen_convolution_backward_input_out( args.odesc.desc(), grad_output.const_data_ptr(), args.wdesc.desc(), weight.const_data_ptr(), args.cdesc.desc(), - args.idesc.desc(), grad_input.mutable_data_ptr(), workspace.data, workspace.size, solution_id)); + args.idesc.desc(), grad_input.mutable_data_ptr(), + workspace.data, + workspace.size, + solution_id)); } else { miopenConvBwdDataAlgorithm_t bwdDataAlg; @@ -1256,216 +1167,521 @@ void raw_miopen_convolution_backward_input_out( MIOPEN_CHECK(miopenConvolutionBackwardData( args.handle, - &one, args.odesc.desc(), grad_output.const_data_ptr(), + &one, + args.odesc.desc(), grad_output.const_data_ptr(), args.wdesc.desc(), weight.const_data_ptr(), - args.cdesc.desc(), bwdDataAlg, &zero, - args.idesc.desc(), grad_input.mutable_data_ptr(), workspace.data, workspace.size)); + args.cdesc.desc(), + bwdDataAlg, + &zero, + args.idesc.desc(), grad_input.mutable_data_ptr(), + workspace.data, + workspace.size)); } } -// see NOTE [ Backward vs transpose convolutions ] in src/Aten/native/cudnn/Conv.cpp +void raw_miopen_convolution_backward_input_out( + const at::Tensor& grad_input, + const at::Tensor& grad_output, + const at::Tensor& weight, + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic, + bool depthwise=false) { + split_batch_dim_to_32bit_out( + grad_input, + grad_output, + weight, + padding, + stride, + dilation, + groups, + benchmark, + deterministic, + depthwise, + 1024 * 1024 * 128, + raw_miopen_convolution_backward_input_out_32bit); +} Tensor miopen_convolution_backward_input( CheckedFrom c, - IntArrayRef input_size, const TensorArg& grad_output, const TensorArg& weight, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic) -{ + IntArrayRef input_size, + const TensorArg& grad_output, + const TensorArg& weight, + IntArrayRef padding, + IntArrayRef 
stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic, + bool depthwise=false) { checkAllSameType(c, {grad_output, weight}); checkAllSameGPU(c, {grad_output, weight}); - auto memory_format = at::MemoryFormat::Contiguous; - if (miopen_conv_use_channels_last(*grad_output, *weight)) { - memory_format = (weight->ndimension() == 5) ? at::MemoryFormat::ChannelsLast3d : at::MemoryFormat::ChannelsLast; - } - + auto memory_format = miopen_conv_suggest_memory_format(*grad_output, *weight); Tensor grad_input_t = at::detail::empty_cuda( input_size, grad_output->options().memory_format(memory_format)); // Avoid "grad_input" when this is being used as transposed convolution - TensorArg grad_input{ grad_input_t, "result", 0 }; - convolution_shape_check(c, grad_input, weight, grad_output, padding, stride, dilation, groups); + TensorArg grad_input{grad_input_t, "result", 0}; + convolution_shape_check( + c, grad_input, weight, grad_output, padding, stride, dilation, groups); - // See #4500 Tensor weight_contig = weight->contiguous(memory_format); - // Make sure that NC11 strides follow formula - weight_contig.resize_(weight_contig.sizes(), memory_format); - Tensor grad_output_contig = grad_output->contiguous(memory_format); - grad_output_contig.resize_(grad_output_contig.sizes(), memory_format); raw_miopen_convolution_backward_input_out( - *grad_input, grad_output_contig, weight_contig, - padding, stride, dilation, groups, benchmark, deterministic); + *grad_input, + grad_output_contig, + weight_contig, + padding, + stride, + dilation, + groups, + benchmark, + deterministic, + depthwise); return *grad_input; } -Tensor miopen_convolution_transpose_forward( - CheckedFrom c, - const TensorArg& grad_output, const TensorArg& weight, - IntArrayRef padding, IntArrayRef output_padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic) -{ - auto input_size = conv_input_size(grad_output->sizes(), weight->sizes(), - padding, output_padding, stride, dilation, groups); - return miopen_convolution_backward_input(c, input_size, grad_output, weight, - padding, stride, dilation, groups, benchmark, deterministic); -} - +// overload Tensor miopen_convolution_backward_input( - IntArrayRef input_size, const Tensor& grad_output_t, const Tensor& weight_t, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic) -{ - TensorArg grad_output{ grad_output_t, "grad_output", 1 }, - weight{ weight_t, "weight", 2 }; + IntArrayRef input_size, + const Tensor& grad_output_t, + const Tensor& weight_t, + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic, + bool depthwise=false) { + TensorArg grad_output{grad_output_t, "grad_output", 1}, + weight{weight_t, "weight", 2}; return miopen_convolution_backward_input( "miopen_convolution_backward_input", - input_size, grad_output, weight, - padding, stride, dilation, groups, benchmark, deterministic); + input_size, + grad_output, + weight, + padding, + stride, + dilation, + groups, + benchmark, + deterministic, + depthwise); } -//Depthwise convolutions backward data. 
-void raw_miopen_depthwise_convolution_backward_input_out( - const at::Tensor& grad_input, - const at::Tensor& grad_output, - const at::Tensor& weight, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic) { - - auto dataType = getMiopenDataType(grad_output); - miopenConvolutionMode_t c_mode = miopenDepthwise; +void raw_miopen_convolution_backward_weight_out_32bit( + const Tensor& grad_weight, + const Tensor& grad_output, + const Tensor& input, + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic, + bool depthwise=false) { + auto dataType = getMiopenDataType(input); + miopenConvolutionMode_t c_mode = depthwise ? miopenDepthwise : miopenConvolution; - ConvolutionArgs args{ grad_input, grad_output, weight }; + ConvolutionArgs args{input, grad_output, grad_weight}; args.handle = getMiopenHandle(); - setConvolutionParams(&args.params, args.handle, grad_input, weight, padding, stride, dilation, groups, deterministic); - args.idesc.set(grad_input); - args.wdesc.set(weight, grad_output.suggest_memory_format(), 0); - args.odesc.set(grad_output); - args.cdesc.set(dataType, c_mode, grad_output.dim() - 2, args.params.padding, args.params.stride, args.params.dilation, args.params.groups, benchmark, deterministic); + at::MemoryFormat memory_format = + miopen_conv_suggest_memory_format(input, grad_weight); + setConvolutionParams( + &args.params, + args.handle, + input, + grad_weight, + padding, + stride, + dilation, + groups, + deterministic, + memory_format); + args.idesc.set(input, memory_format); + args.wdesc.set(grad_weight, memory_format, 0); + args.odesc.set(grad_output, memory_format); + args.cdesc.set( + dataType, + c_mode, + input.dim() - 2, + args.params.padding, + args.params.stride, + args.params.dilation, + args.params.groups, + benchmark, + deterministic); if (at::globalContext().immediateMiopen()) { uint64_t solution_id; - Workspace workspace = chooseSolution(args, &solution_id); + Workspace workspace = chooseSolution(args, &solution_id); - MIOPEN_CHECK(miopenConvolutionBackwardDataImmediate( + MIOPEN_CHECK(miopenConvolutionBackwardWeightsImmediate( args.handle, args.odesc.desc(), grad_output.const_data_ptr(), - args.wdesc.desc(), weight.const_data_ptr(), + args.idesc.desc(), input.const_data_ptr(), args.cdesc.desc(), - args.idesc.desc(), grad_input.mutable_data_ptr(), workspace.data, workspace.size, solution_id)); + args.wdesc.desc(), grad_weight.data_ptr(), + workspace.data, + workspace.size, + solution_id)); + } + else { + miopenConvBwdWeightsAlgorithm_t bwdFilterAlg; + Workspace workspace = chooseAlgorithm(args, benchmark, &bwdFilterAlg); + + Constant one(dataType, 1); + Constant zero(dataType, 0); + + MIOPEN_CHECK(miopenConvolutionBackwardWeights( + args.handle, + &one, + args.odesc.desc(), grad_output.const_data_ptr(), + args.idesc.desc(), input.const_data_ptr(), + args.cdesc.desc(), + bwdFilterAlg, + &zero, + args.wdesc.desc(), grad_weight.data_ptr(), + workspace.data, + workspace.size)); + } +} + +void raw_miopen_convolution_backward_weight_out( + const Tensor& grad_weight, + const Tensor& grad_output, + const Tensor& input, + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic, + bool depthwise=false) { + constexpr int64_t int_max = std::numeric_limits::max(); + const int64_t ni = input.numel(); + const int64_t no = grad_output.numel(); + // Assume the shape of the tensor 
is (N, C, D1, D2, ...)
+  // if N * C * D1 * D2 * ... <= int_max, then no need to split at all
+  if (ni <= int_max && no <= int_max) {
+    raw_miopen_convolution_backward_weight_out_32bit(
+        grad_weight,
+        grad_output,
+        input,
+        padding,
+        stride,
+        dilation,
+        groups,
+        benchmark,
+        deterministic,
+        depthwise);
+    return;
   }
-  else {
-    miopenConvBwdDataAlgorithm_t bwdDataAlg;
-    Workspace workspace = chooseAlgorithm(args, benchmark, &bwdDataAlg);
-
-    Constant one(dataType, 1);
-    Constant zero(dataType, 0);
-
-    MIOPEN_CHECK(miopenConvolutionBackwardData(
-        args.handle,
-        &one, args.odesc.desc(), grad_output.const_data_ptr(),
-        args.wdesc.desc(), weight.const_data_ptr(),
-        args.cdesc.desc(), bwdDataAlg, &zero,
-        args.idesc.desc(), grad_input.mutable_data_ptr(), workspace.data, workspace.size));
+  // else, if C * D1 * D2 * ... <= int_max, then we just need to split across
+  // the N dimension
+  //
+  // Here we use a simple heuristic to determine the size of each split.
+  // We don't max out the 2^31 address space because a request that large is
+  // very likely to OOM.
+  int64_t n = grad_output.size(0);
+  int64_t max_inner_size = std::max(ni, no) / n;
+  int64_t split_size =
+      std::max<int64_t>(1024 * 1024 * 512 / max_inner_size, 1L);
+  int64_t num_splits = (n + split_size - 1) / split_size;
+  if (split_size * max_inner_size < int_max) {
+    const auto kAccType = (grad_weight.scalar_type() == kHalf ||
+                           grad_weight.scalar_type() == kBFloat16)
+        ? kFloat
+        : grad_weight.scalar_type();
+    Tensor grad_weight_accumulator =
+        at::zeros(grad_weight.sizes(), grad_weight.options().dtype(kAccType));
+    for (const auto i : c10::irange(num_splits)) {
+      int64_t start = split_size * i;
+      int64_t split_size_ = std::min(split_size, n - start);
+      Tensor input_ = input.narrow(0, start, split_size_);
+      Tensor grad_output_ = grad_output.narrow(0, start, split_size_);
+      Tensor grad_weight_ = at::empty_like(grad_weight);
+      raw_miopen_convolution_backward_weight_out_32bit(
+          grad_weight_,
+          grad_output_,
+          input_,
+          padding,
+          stride,
+          dilation,
+          groups,
+          benchmark,
+          deterministic,
+          depthwise);
+      grad_weight_accumulator.add_(grad_weight_);
+    }
+    grad_weight.copy_(grad_weight_accumulator);
+    return;
   }
+  // If control flow reaches here, even splitting along N is not enough, and
+  // things start to become complicated. For example, for conv2d the following
+  // questions need to be considered:
+  // - Is the memory layout NCHW or NHWC?
+  // - If the conv is NCHW -> NC'H'W', then should we
+  //   - split only NC?
+  //   - split only N'C'?
+  //   - split both?
+  // - If the conv is NHWC, then we need to split across H, and we need to be
+  //   very careful about the boundary condition to make sure that the
+  //   boundary is handled correctly.
+  // - If we decide to make these splits, is the memory contiguous? Do we need
+  //   to copy the memory?
+  // Considering the complexity of this issue, it is better not to use
+  // MIOpen for this case.
+  TORCH_INTERNAL_ASSERT(false, "This case should not be dispatched to MIOpen.");
 }
-Tensor miopen_depthwise_convolution_backward_input(
+Tensor miopen_convolution_backward_weight(
     CheckedFrom c,
-    IntArrayRef input_size, const TensorArg& grad_output, const TensorArg& weight,
-    IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups,
-    bool benchmark, bool deterministic)
-{
-  checkAllSameType(c, {grad_output, weight});
-  checkAllSameGPU(c, {grad_output, weight});
+    IntArrayRef weight_size,
+    const Tensor& grad_output_t,
+    const Tensor& input_t,
+    IntArrayRef padding,
+    IntArrayRef stride,
+    IntArrayRef dilation,
+    int64_t groups,
+    bool benchmark,
+    bool deterministic,
+    bool depthwise=false) {
+  auto memory_format = miopen_conv_suggest_memory_format(input_t, grad_output_t);
-  auto memory_format = at::MemoryFormat::Contiguous;
-  if (miopen_conv_use_channels_last(*grad_output, *weight)) {
-    memory_format = (weight->ndimension() == 5) ? at::MemoryFormat::ChannelsLast3d : at::MemoryFormat::ChannelsLast;
-  }
+  Tensor grad_output_contig_t = grad_output_t.contiguous(memory_format);
+  TensorArg grad_output_contig{grad_output_contig_t, "grad_output", 1};
-  Tensor grad_input_t = at::detail::empty_cuda(
-      input_size, grad_output->options().memory_format(memory_format));
+  Tensor input_contig_t = input_t.contiguous(memory_format);
+  TensorArg input{input_contig_t, "input", 2};
-  TensorArg grad_input{ grad_input_t, "result", 0 };
-  convolution_shape_check(c, grad_input, weight, grad_output, padding, stride, dilation, groups);
+  checkAllSameType(c, {grad_output_contig, input});
+  checkAllSameGPU(c, {grad_output_contig, input});
-  // See #4500
-  Tensor weight_contig = weight->contiguous(memory_format);
-  // Make sure that NC11 strides follow formula
-  weight_contig.resize_(weight_contig.sizes(), memory_format);
+  auto grad_weight_t =
+      at::empty(weight_size, grad_output_contig->options(), memory_format);
-  Tensor grad_output_contig = grad_output->contiguous(memory_format);
-  grad_output_contig.resize_(grad_output_contig.sizes(), memory_format);
+  // For uniformity with everything else, although it seems grad_weight
+  // would be unambiguous too.
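+  // (convolution_shape_check below only validates that weight_size and the
+  // contiguous grad_output/input are mutually consistent with
+  // padding/stride/dilation/groups; the actual reduction happens in
+  // raw_miopen_convolution_backward_weight_out.)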
+ TensorArg grad_weight{grad_weight_t, "result", 0}; + convolution_shape_check( + c, + input, + grad_weight, + grad_output_contig, + padding, + stride, + dilation, + groups); - raw_miopen_depthwise_convolution_backward_input_out( - *grad_input, grad_output_contig, weight_contig, - padding, stride, dilation, groups, benchmark, deterministic); + raw_miopen_convolution_backward_weight_out( + *grad_weight, + *grad_output_contig, + *input, + padding, + stride, + dilation, + groups, + benchmark, + deterministic, + depthwise); - return *grad_input; + return grad_weight_t; } -Tensor miopen_depthwise_convolution_backward_input( - IntArrayRef input_size, const Tensor& grad_output_t, const Tensor& weight_t, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic) -{ - TensorArg grad_output{ grad_output_t, "grad_output", 1 }, - weight{ weight_t, "weight", 2 }; - return miopen_depthwise_convolution_backward_input( - "miopen_depthwise_convolution_backward_input", - input_size, grad_output, weight, - padding, stride, dilation, groups, benchmark, deterministic); +// overload +Tensor miopen_convolution_backward_weight( + IntArrayRef weight_size, + const Tensor& grad_output_t, + const Tensor& input_t, + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic, + bool depthwise=false) { + return miopen_convolution_backward_weight( + "miopen_convolution_backward_weight", + weight_size, + grad_output_t, + input_t, + padding, + stride, + dilation, + groups, + benchmark, + deterministic, + depthwise); } -std::tuple miopen_convolution_backward( - const at::Tensor& input, const at::Tensor& grad_output_t, const at::Tensor& weight, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic, std::array output_mask) { - - Tensor grad_output = grad_output_t.contiguous(input.suggest_memory_format()); +std::tuple miopen_convolution_backward( + const at::Tensor& input, + const at::Tensor& grad_output_t, + const at::Tensor& weight, + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic, + std::array output_mask) { + Tensor grad_output = grad_output_t.to(input.suggest_memory_format()); Tensor grad_input, grad_weight, grad_bias; - if (output_mask[0]) { - grad_input = miopen_convolution_backward_input(input.sizes(), grad_output, weight, padding, stride, dilation, groups, benchmark, deterministic); - } - if (output_mask[1]) { - grad_weight = miopen_convolution_backward_weight(weight.sizes(), grad_output, input, padding, stride, dilation, groups, benchmark, deterministic); - } - if (output_mask[2]) { - grad_bias = miopen_convolution_backward_bias(grad_output); + if (input.numel() == 0) { + if (output_mask[0]) { + grad_input = at::empty_like(input, LEGACY_CONTIGUOUS_MEMORY_FORMAT); + } + if (output_mask[1]) { + grad_weight = at::zeros_like(weight, LEGACY_CONTIGUOUS_MEMORY_FORMAT); + } + if (output_mask[2]) { + grad_bias = at::zeros_like(grad_output_t, LEGACY_CONTIGUOUS_MEMORY_FORMAT); + } + } else { + if (output_mask[0]) { + grad_input = miopen_convolution_backward_input( + input.sizes(), + grad_output, + weight, + padding, + stride, + dilation, + groups, + benchmark, + deterministic); + } + if (output_mask[1]) { + grad_weight = miopen_convolution_backward_weight( + weight.sizes(), + grad_output, + input, + padding, + stride, + dilation, + groups, + benchmark, + 
deterministic); + } + if (output_mask[2]) { + grad_bias = miopen_convolution_backward_bias(grad_output); + } } - return std::tuple{grad_input, grad_weight, grad_bias}; + return std::tuple{grad_input, grad_weight, grad_bias}; } -std::tuple miopen_depthwise_convolution_backward( - const at::Tensor& input, const at::Tensor& grad_output_t, const at::Tensor& weight, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic, std::array output_mask) { +Tensor miopen_convolution_transpose_forward( + CheckedFrom c, + const TensorArg& grad_output, + const TensorArg& weight, + IntArrayRef padding, + IntArrayRef output_padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic) { + auto input_size = conv_input_size( + grad_output->sizes(), + weight->sizes(), + padding, + output_padding, + stride, + dilation, + groups); + return miopen_convolution_backward_input( + c, + input_size, + grad_output, + weight, + padding, + stride, + dilation, + groups, + benchmark, + deterministic); +} - Tensor grad_output = grad_output_t.contiguous(input.suggest_memory_format()); +Tensor miopen_convolution_transpose_backward_weight( + IntArrayRef weight_size, + const Tensor& grad_output_t, + const Tensor& input_t, + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic) { + return miopen_convolution_backward_weight( + "miopen_convolution_backward_weight", + weight_size, + input_t, + grad_output_t, + padding, + stride, + dilation, + groups, + benchmark, + deterministic); +} - Tensor grad_input, grad_weight, grad_bias; - if (output_mask[0]) { - grad_input = miopen_depthwise_convolution_backward_input(input.sizes(), grad_output, weight, padding, stride, dilation, groups, benchmark, deterministic); - } - if (output_mask[1]) { - grad_weight = miopen_depthwise_convolution_backward_weight(weight.sizes(), grad_output, input, padding, stride, dilation, groups, benchmark, deterministic); - } - if (output_mask[2]) { - grad_bias = miopen_convolution_backward_bias(grad_output); - } +Tensor miopen_convolution_transpose( + const Tensor& input_t, + const Tensor& weight_t, + const std::optional& bias_t_opt, + IntArrayRef padding, + IntArrayRef output_padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic) { + // See [Note: hacky wrapper removal for optional tensor] + c10::MaybeOwned bias_t_maybe_owned = at::borrow_from_optional_tensor(bias_t_opt); + const Tensor& bias_t = *bias_t_maybe_owned; - return std::tuple{grad_input, grad_weight, grad_bias}; + TensorArg input{input_t, "input", 1}, weight{weight_t, "weight", 2}, bias{bias_t, "bias", 3}; + CheckedFrom c = "miopen_convolution_transpose"; + auto output_t = miopen_convolution_transpose_forward( + c, + input, + weight, + padding, + output_padding, + stride, + dilation, + groups, + benchmark, + deterministic); + if (bias->defined()) { + miopen_convolution_add_bias_(c, { output_t, "result", 0 }, bias); + } + return output_t; } -Tensor miopen_convolution_transpose( - const Tensor& input_t, const Tensor& weight_t, const std::optional& bias_t_opt, - IntArrayRef padding, IntArrayRef output_padding, IntArrayRef stride, IntArrayRef dilation, - int64_t groups, bool benchmark, bool deterministic) +// --------------------------------------------------------------------- +// +// Convolution depthwise +// +// 
--------------------------------------------------------------------- + +Tensor miopen_depthwise_convolution( + const Tensor& input_t, + const Tensor& weight_t, + const std::optional& bias_t_opt, + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned bias_t_maybe_owned = at::borrow_from_optional_tensor(bias_t_opt); @@ -1474,16 +1690,86 @@ Tensor miopen_convolution_transpose( TensorArg input { input_t, "input", 1 }, weight { weight_t, "weight", 2 }, bias { bias_t, "bias", 3 }; - CheckedFrom c = "miopen_convolution_transpose"; - auto output_t = miopen_convolution_transpose_forward( - c, input, weight, padding, output_padding, stride, dilation, groups, benchmark, deterministic); + CheckedFrom c = "miopen_depthwise_convolution"; + auto memory_format = miopen_conv_suggest_memory_format(input_t, weight_t); + Tensor output_t = at::detail::empty_cuda( + conv_output_size( + input_t.sizes(), weight_t.sizes(), padding, stride, dilation), + input_t.options().memory_format(memory_format)); + if (output_t.numel() == 0) { + return output_t; + } + // Avoid ambiguity of "output" when this is being used as backwards + TensorArg output{output_t, "result", 0}; + miopen_convolution_forward_out( + output, + c, + input, + weight, + padding, + stride, + dilation, + groups, + benchmark, + deterministic, + true); if (bias->defined()) { - miopen_convolution_add_bias_(c, { output_t, "result", 0 }, bias); + miopen_convolution_add_bias_(c, output, bias); } - return output_t; + return *output; } -// MIOpen fused convolution bias activation forward +std::tuple miopen_depthwise_convolution_backward( + const at::Tensor& input, + const at::Tensor& grad_output_t, + const at::Tensor& weight, + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic, + std::array output_mask) { + Tensor grad_output = grad_output_t.to(input.suggest_memory_format()); + + Tensor grad_input, grad_weight, grad_bias; + if (output_mask[0]) { + grad_input = miopen_convolution_backward_input( + input.sizes(), + grad_output, + weight, + padding, + stride, + dilation, + groups, + benchmark, + deterministic, + true); + } + if (output_mask[1]) { + grad_weight = miopen_convolution_backward_weight( + weight.sizes(), + grad_output, + input, + padding, + stride, + dilation, + groups, + benchmark, + deterministic, + true); + } + if (output_mask[2]) { + grad_bias = miopen_convolution_backward_bias(grad_output); + } + + return std::tuple{grad_input, grad_weight, grad_bias}; +} + +// --------------------------------------------------------------------- +// fusions +// --------------------------------------------------------------------- + void raw_miopen_convolution_relu_out( const Tensor& output, const Tensor& input, @@ -1495,17 +1781,35 @@ void raw_miopen_convolution_relu_out( int64_t groups, bool benchmark, bool deterministic) { - auto dataType = getMiopenDataType(input); miopenConvolutionMode_t c_mode = miopenConvolution; - ConvolutionArgs args{ input, output, weight }; args.handle = getMiopenHandle(); - setConvolutionParams(&args.params, args.handle, input, weight, padding, stride, dilation, groups, deterministic); - args.idesc.set(input); - args.wdesc.set(weight, input.suggest_memory_format(), 0); - args.odesc.set(output); - args.cdesc.set(dataType, c_mode, input.dim() - 2, args.params.padding, args.params.stride, args.params.dilation, 
args.params.groups, benchmark, deterministic); + at::MemoryFormat memory_format = miopen_conv_suggest_memory_format(input, weight); + setConvolutionParams( + &args.params, + args.handle, + input, + weight, + padding, + stride, + dilation, + groups, + deterministic, + memory_format); + args.idesc.set(input, memory_format); + args.wdesc.set(weight, memory_format, 0); + args.odesc.set(output, memory_format); + args.cdesc.set( + dataType, + c_mode, + input.dim() - 2, + args.params.padding, + args.params.stride, + args.params.dilation, + args.params.groups, + benchmark, + deterministic); TensorDescriptor bdesc; bdesc.set(bias.expand({1, bias.size(0)}), output.dim()); @@ -1549,8 +1853,8 @@ static at::Tensor self_or_new_memory_format(at::Tensor& self, at::MemoryFormat m } Tensor miopen_convolution_add_relu( - const Tensor& input, - const Tensor& weight, + const Tensor& input_t, + const Tensor& weight_t, const Tensor& z, const std::optional& alpha, const std::optional& bias, @@ -1562,17 +1866,28 @@ Tensor miopen_convolution_add_relu( // MIOpen does not support fusion of add, the alpha2 * z step of the below cuDNN function: // y = act ( alpha1 * conv(x) + alpha2 * z + bias ) - auto memory_format = input.suggest_memory_format(); + auto memory_format = miopen_conv_suggest_memory_format(input_t, weight_t); auto& ctx = at::globalContext(); bool benchmark = ctx.benchmarkCuDNN(); - TensorArg input_arg { input, "input", 1 }, - weight_arg { weight, "weight", 2 }; - auto output = miopen_convolution_forward( + TensorArg input { input_t, "input", 1 }, + weight { weight_t, "weight", 2 }; + + Tensor output_t = at::detail::empty_cuda( + conv_output_size( + input_t.sizes(), weight_t.sizes(), padding, stride, dilation), + input_t.options().memory_format(memory_format)); + if (output_t.numel() == 0){ + return output_t; + } + // Avoid ambiguity of "output" when this is being used as backwards + TensorArg output{output_t, "result", 0}; + miopen_convolution_forward_out( + output, "miopen_convolution_add_relu", - input_arg, - weight_arg, + input, + weight, padding, stride, dilation, @@ -1581,53 +1896,51 @@ Tensor miopen_convolution_add_relu( false // deterministic ); - auto contig_output = self_or_new_memory_format(output, memory_format); + auto contig_output_t = self_or_new_memory_format(output_t, memory_format); - if (!output.is_same(contig_output)) { - contig_output.copy_(output); + if (!output_t.is_same(contig_output_t)) { + contig_output_t.copy_(output_t); } auto _alpha = alpha.has_value() ? alpha.value().to() : 1.0; auto _bias = bias.has_value() ? 
bias.value() : at::zeros( - {contig_output.size(1)}, - optTypeMetaToScalarType(contig_output.options().dtype_opt()), - contig_output.options().layout_opt(), - contig_output.options().device_opt(), - contig_output.options().pinned_memory_opt()); + {contig_output_t.size(1)}, + optTypeMetaToScalarType(contig_output_t.options().dtype_opt()), + contig_output_t.options().layout_opt(), + contig_output_t.options().device_opt(), + contig_output_t.options().pinned_memory_opt()); - at::Tensor alpha_mul_z_add_bias = at::native::reshape_bias(input.dim(), _bias).add(z, _alpha); - contig_output.add_(alpha_mul_z_add_bias); - contig_output.relu_(); + at::Tensor alpha_mul_z_add_bias = at::native::reshape_bias(input_t.dim(), _bias).add(z, _alpha); + contig_output_t.add_(alpha_mul_z_add_bias); + contig_output_t.relu_(); - return contig_output; + return contig_output_t; } Tensor miopen_convolution_relu( - const Tensor& input, - const Tensor& weight, + const Tensor& input_t, + const Tensor& weight_t, const std::optional& bias, IntArrayRef stride, IntArrayRef padding, IntArrayRef dilation, int64_t groups) { - auto memory_format = input.suggest_memory_format(); - auto& ctx = at::globalContext(); bool benchmark = ctx.benchmarkCuDNN(); // MIOpen currently only supports MemoryFormat::Contiguous and fp32 and 2d - if (input.suggest_memory_format() == at::MemoryFormat::Contiguous - && input.scalar_type() == at::kFloat - && input.ndimension() == 4) { + if (input_t.suggest_memory_format() == at::MemoryFormat::Contiguous + && input_t.scalar_type() == at::kFloat + && input_t.ndimension() == 4) { // FuseFrozenConvAddRelu performs some tensor shape checking Tensor output_t = at::detail::empty_cuda( conv_output_size( - input.sizes(), weight.sizes(), padding, stride, dilation), - input.options().memory_format(input.suggest_memory_format())); + input_t.sizes(), weight_t.sizes(), padding, stride, dilation), + input_t.options().memory_format(input_t.suggest_memory_format())); if (output_t.numel() == 0) { return output_t; } @@ -1643,8 +1956,8 @@ Tensor miopen_convolution_relu( raw_miopen_convolution_relu_out( output_t, - input, - weight, + input_t, + weight_t, _bias, stride, padding, @@ -1659,12 +1972,25 @@ Tensor miopen_convolution_relu( else { // fallback - TensorArg input_arg { input, "input", 1 }, - weight_arg { weight, "weight", 2 }; - auto output = miopen_convolution_forward( + auto memory_format = miopen_conv_suggest_memory_format(input_t, weight_t); + + TensorArg input { input_t, "input", 1 }, + weight { weight_t, "weight", 2 }; + + Tensor output_t = at::detail::empty_cuda( + conv_output_size( + input_t.sizes(), weight_t.sizes(), padding, stride, dilation), + input->options().memory_format(memory_format)); + if (output_t.numel() == 0){ + return output_t; + } + // Avoid ambiguity of "output" when this is being used as backwards + TensorArg output{output_t, "result", 0}; + miopen_convolution_forward_out( + output, "miopen_convolution_relu", - input_arg, - weight_arg, + input, + weight, padding, stride, dilation, @@ -1673,26 +1999,26 @@ Tensor miopen_convolution_relu( false // deterministic ); - auto contig_output = self_or_new_memory_format(output, memory_format); + auto contig_output_t = self_or_new_memory_format(output_t, memory_format); - if (!output.is_same(contig_output)) { - contig_output.copy_(output); + if (!output_t.is_same(contig_output_t)) { + contig_output_t.copy_(output_t); } auto _bias = bias.has_value() ? 
bias.value() : at::zeros( - {contig_output.size(1)}, - optTypeMetaToScalarType(contig_output.options().dtype_opt()), - contig_output.options().layout_opt(), - contig_output.options().device_opt(), - contig_output.options().pinned_memory_opt()); + {contig_output_t.size(1)}, + optTypeMetaToScalarType(contig_output_t.options().dtype_opt()), + contig_output_t.options().layout_opt(), + contig_output_t.options().device_opt(), + contig_output_t.options().pinned_memory_opt()); - at::Tensor reshaped_bias = at::native::reshape_bias(input.dim(), _bias); - contig_output.add_(reshaped_bias); - contig_output.relu_(); + at::Tensor reshaped_bias = at::native::reshape_bias(input_t.dim(), _bias); + contig_output_t.add_(reshaped_bias); + contig_output_t.relu_(); - return contig_output; + return contig_output_t; } } diff --git a/aten/src/ATen/native/mkldnn/xpu/Attention.cpp b/aten/src/ATen/native/mkldnn/xpu/Attention.cpp index ef485904f977..873005b3dd2b 100644 --- a/aten/src/ATen/native/mkldnn/xpu/Attention.cpp +++ b/aten/src/ATen/native/mkldnn/xpu/Attention.cpp @@ -260,7 +260,7 @@ _scaled_dot_product_fused_attention_overrideable_xpu( alloc_with_matching_layout(query, output, output_shape); at::Tensor logsumexp, debug_attn_mask; // not supported - at::native::onednn::gpu_float_sdpa( + at::native::onednn::sdpa( batch_size, seq_len_q, seq_len_kv, @@ -274,7 +274,9 @@ _scaled_dot_product_fused_attention_overrideable_xpu( attn_bias, is_causal, scale.has_value() ? scale.value() : (1.0 / std::sqrt(head_dim_qk)), - output); + output, + false, + logsumexp); // rng not used auto philox_seed = at::empty({}, at::dtype(at::kLong)); diff --git a/aten/src/ATen/native/mkldnn/xpu/detail/Attention.cpp b/aten/src/ATen/native/mkldnn/xpu/detail/Attention.cpp index 1d90711f6e38..e840e21f4f7a 100644 --- a/aten/src/ATen/native/mkldnn/xpu/detail/Attention.cpp +++ b/aten/src/ATen/native/mkldnn/xpu/detail/Attention.cpp @@ -13,6 +13,9 @@ using dims = logical_tensor::dims; using op = dnnl::graph::op; using partition = dnnl::graph::partition; +constexpr logical_tensor::data_type sdpa_intermediate_dtype = + logical_tensor::data_type::f32; + inline data_type to_logical_tensor_data_type(c10::ScalarType scalar_type) { return scalar_type == c10::ScalarType::Float ? data_type::f32 : scalar_type == c10::ScalarType::Half ? 
data_type::f16 @@ -20,6 +23,8 @@ inline data_type to_logical_tensor_data_type(c10::ScalarType scalar_type) { : data_type::undef; } +namespace sdpa_forward { + struct SDPALogicalParams { enum class TensorID { query, @@ -28,7 +33,8 @@ struct SDPALogicalParams { neg_inf, attn_mask, value, - output, + attention, + logsumexp, end, }; @@ -38,14 +44,16 @@ struct SDPALogicalParams { std::optional neg_inf; std::optional attn_mask; logical_tensor value{}; - logical_tensor output{}; + logical_tensor attention{}; + std::optional logsumexp; SDPALogicalParams( const at::Tensor& query_, const at::Tensor& key_, const at::Tensor& value_, const std::optional& attn_mask_, - const at::Tensor& output_, + const at::Tensor& attention_, + const at::Tensor& logsumexp_, int batch_size, int seq_len_q, int seq_len_kv, @@ -53,19 +61,26 @@ struct SDPALogicalParams { int num_head_kv, int head_dim_qk, int head_dim_v, - bool is_causal) { + bool is_causal, + bool compute_logsumexp) { const data_type dtype = to_logical_tensor_data_type(query_.scalar_type()); TORCH_INTERNAL_ASSERT( (dtype != data_type::undef), "Only FP16/BF16/FP32 datatypes are currently supported"); + TORCH_INTERNAL_ASSERT( + query_.scalar_type() == attention_.scalar_type(), + "scaled_dot_product_attention_xpu: query and attention tensors should have the same data type."); const dims scalar_shape = {1}; - std::vector inputLogicalTensors; at::Tensor reshaped_query = query_; at::Tensor reshaped_key = key_; at::Tensor reshaped_value = value_; - at::Tensor reshaped_output = output_; + at::Tensor reshaped_attention = attention_; + at::Tensor reshaped_logsumexp = + compute_logsumexp ? logsumexp_.unsqueeze(-1) : logsumexp_; at::Tensor reshaped_attn_mask = attn_mask_.value_or(at::Tensor()); + + // handle broadcasted input tensors for OneDNN if (at::native::onednn::is_broadcast(reshaped_query)) { at::native::onednn::undo_broadcast(reshaped_query); } @@ -75,9 +90,6 @@ struct SDPALogicalParams { if (at::native::onednn::is_broadcast(reshaped_value)) { at::native::onednn::undo_broadcast(reshaped_value); } - if (at::native::onednn::is_broadcast(reshaped_output)) { - at::native::onednn::undo_broadcast(reshaped_output); - } if (attn_mask_.has_value() && at::native::onednn::is_broadcast(reshaped_attn_mask)) { at::native::onednn::undo_broadcast(reshaped_attn_mask); @@ -95,23 +107,22 @@ struct SDPALogicalParams { {batch_size, group_num, group_size, seq_len_q, head_dim_qk}); reshaped_key = key_.unsqueeze(2); reshaped_value = value_.unsqueeze(2); - reshaped_output = output_.view( + reshaped_attention = attention_.view( {batch_size, group_num, group_size, seq_len_q, head_dim_v}); if (attn_mask_.has_value() && attn_mask_.value().dim() == 4) { reshaped_attn_mask = attn_mask_.value().unsqueeze(2); } } - query = { - static_cast(TensorID::query), - dtype, - reshaped_query.sizes().vec(), - reshaped_query.strides().vec()}; - key = { - static_cast(TensorID::key), - dtype, - reshaped_key.sizes().vec(), - reshaped_key.strides().vec()}; +#define LOGIC_TENSOR_DESC(name, dtype) \ + name = { \ + static_cast(TensorID::name), \ + dtype, \ + reshaped_##name.sizes().vec(), \ + reshaped_##name.strides().vec()} + + LOGIC_TENSOR_DESC(query, dtype); + LOGIC_TENSOR_DESC(key, dtype); scale = { static_cast(TensorID::scale), to_logical_tensor_data_type(at::toOpMathType(query_.scalar_type())), @@ -132,22 +143,19 @@ struct SDPALogicalParams { TORCH_INTERNAL_ASSERT( (mask_dtype != data_type::undef), "Only FP16/BF16/FP32 datatypes are currently supported for attn_mask"); - attn_mask = { - 
static_cast(TensorID::attn_mask), - mask_dtype, - reshaped_attn_mask.sizes().vec(), - reshaped_attn_mask.strides().vec()}; + LOGIC_TENSOR_DESC(attn_mask, mask_dtype); } - value = { - static_cast(TensorID::value), - dtype, - reshaped_value.sizes().vec(), - reshaped_value.strides().vec()}; - output = { - static_cast(TensorID::output), - dtype, - reshaped_output.sizes().vec(), - reshaped_output.strides().vec()}; + LOGIC_TENSOR_DESC(value, dtype); + LOGIC_TENSOR_DESC(attention, dtype); + if (compute_logsumexp) { + TORCH_INTERNAL_ASSERT( + logsumexp_.scalar_type() == at::kFloat, + "scaled_dot_product_attention: Expected logsumexp data type in FP32, but got ", + logsumexp_.scalar_type(), + " instead."); + LOGIC_TENSOR_DESC(logsumexp, sdpa_intermediate_dtype); + } +#undef LOGIC_TENSOR_DESC } std::vector get_input() const { std::vector input = {query, key, scale}; @@ -161,16 +169,21 @@ struct SDPALogicalParams { return input; } std::vector get_output() const { - return {output}; + std::vector output; + output.push_back(attention); + if (logsumexp.has_value()) { + output.push_back(logsumexp.value()); + } + return output; } }; partition create_sdpa_graph_partition( bool is_causal, + bool compute_logsumexp, data_type dtype, const SDPALogicalParams& params) { // graph building and partitioning - // currently, we assume that Q and K have same sequence length size_t lt_id = static_cast(SDPALogicalParams::TensorID::end); size_t op_id = 0; @@ -180,7 +193,7 @@ partition create_sdpa_graph_partition( // Matrix Extensions (Intel(R) XMX) support, which means the // Q/K/V tensors have bf16 or f16 data type while the output of the first // MatMul, Scale, Mask, and the input of SoftMax are in f32 data type. - logical_tensor matmul_qk_out{lt_id++, data_type::f32}; + logical_tensor matmul_qk_out{lt_id++, sdpa_intermediate_dtype}; op matmul_qk{ op_id++, op::kind::MatMul, @@ -189,7 +202,7 @@ partition create_sdpa_graph_partition( "matmul_qk"}; matmul_qk.set_attr(op::attr::transpose_b, true); - logical_tensor scaled_qk_out{lt_id++, data_type::f32}; + logical_tensor scaled_qk_out{lt_id++, sdpa_intermediate_dtype}; op scale_mul{ op_id++, op::kind::Multiply, @@ -214,7 +227,7 @@ partition create_sdpa_graph_partition( if (params.attn_mask.has_value()) { TORCH_INTERNAL_ASSERT( !is_causal, "Additive mask cannot use with is_causal."); - masked_qk_out = {lt_id++, data_type::f32}; + masked_qk_out = {lt_id++, sdpa_intermediate_dtype}; mask_add = { op_id++, op::kind::Add, @@ -249,7 +262,7 @@ partition create_sdpa_graph_partition( {mask_gt_out.value()}, "mask_gt"}; - masked_qk_out = {lt_id++, data_type::f32}; + masked_qk_out = {lt_id++, sdpa_intermediate_dtype}; mask_select = { op_id++, op::kind::Select, @@ -270,12 +283,15 @@ partition create_sdpa_graph_partition( logical_tensor softmax_out{lt_id++, dtype}; softmax.add_input(masked_qk_out.value_or(scaled_qk_out)); softmax.add_output(softmax_out); + if (compute_logsumexp) { + softmax.add_output(params.logsumexp.value()); + } op matmul_v{ op_id++, op::kind::MatMul, {softmax_out, params.value}, - {params.output}, + {params.attention}, "matmul_v"}; constexpr auto ekind = dnnl::engine::kind::gpu; @@ -304,44 +320,469 @@ partition create_sdpa_graph_partition( partition& find_or_create_graph_partition( bool is_causal, + bool compute_logsumexp, const SDPALogicalParams& params) { - thread_local static PartitionCache cache; + thread_local PartitionCache cache; const data_type dtype = params.query.get_data_type(); // cache key creation // patternID is determined on the basis of the arguments 
provided std::bitset<32> patternID; if (dtype == data_type::f32) { - // bit 3 corresponds to float32 dtype - patternID.set(3, 1); + patternID.set(static_cast(PartitionCache::BitType::Float32), 1); } if (dtype == data_type::bf16) { - // bit 2 corresponds to fp16/bf16 dtype - patternID.set(2, 1); + patternID.set(static_cast(PartitionCache::BitType::Bfloat16), 1); } // sdp pattern - patternID.set(4, 1); + patternID.set(static_cast(PartitionCache::BitType::SdpaPattern), 1); // Refer to comments in Utils.h. The first 8 bits are reserved int pos = 8; // attn_mask patternID.set(pos++, params.attn_mask.has_value()); patternID.set(pos++, is_causal); + // compute_logsumexp + patternID.set(pos++, compute_logsumexp); auto partition_ = cache.find_partition(patternID); if (!partition_.has_value()) { // partition cache no hit // graph building and partitioning - partition sdp_partition = - create_sdpa_graph_partition(is_causal, dtype, params); + partition sdp_partition = create_sdpa_graph_partition( + is_causal, compute_logsumexp, dtype, params); partition_ = cache.insert_partition_cache(patternID, sdp_partition); } return *partition_; } +} // namespace sdpa_forward + +namespace sdpa_backward { + +struct SDPABackwardLogicalParams { + enum class TensorID { + grad_out, + query, + key, + value, + out, + logsumexp, + scale, + neg_inf, + attn_mask, + grad_query, + grad_key, + grad_value, + end, + }; + + logical_tensor grad_out{}; + logical_tensor query{}; + logical_tensor key{}; + logical_tensor value{}; + logical_tensor out{}; + logical_tensor logsumexp{}; + logical_tensor scale{}; + std::optional neg_inf; + std::optional attn_mask; + logical_tensor grad_query{}; + logical_tensor grad_key{}; + logical_tensor grad_value{}; + + SDPABackwardLogicalParams( + const at::Tensor& grad_out_, + const at::Tensor& query_, + const at::Tensor& key_, + const at::Tensor& value_, + const at::Tensor& out_, + const at::Tensor& logsumexp_, + const std::optional& attn_mask_, + const at::Tensor& grad_query_, + const at::Tensor& grad_key_, + const at::Tensor& grad_value_, + int batch_size, + int num_head_q, + int num_head_kv, + int seq_len_q, + int seq_len_kv, + int head_dim_qk, + int head_dim_v, + bool is_causal) { + const data_type dtype = to_logical_tensor_data_type(query_.scalar_type()); + TORCH_INTERNAL_ASSERT( + (dtype != data_type::undef), + "Only FP16/BF16/FP32 datatypes are currently supported"); + TORCH_INTERNAL_ASSERT( + grad_out_.scalar_type() == query_.scalar_type() && + grad_out_.scalar_type() == key_.scalar_type() && + grad_out_.scalar_type() == value_.scalar_type() && + grad_out_.scalar_type() == out_.scalar_type(), + "scaled_dot_product_attention_backward_xpu: Expected grad_out, q, k, v and out to have the same data type, but got ", + " grad_out: ", + grad_out_.scalar_type(), + ", q: ", + query_.scalar_type(), + ", k: ", + key_.scalar_type(), + ", v: ", + value_.scalar_type(), + ", out: ", + out_.scalar_type()); + TORCH_INTERNAL_ASSERT( + logsumexp_.defined() && logsumexp_.scalar_type() == at::kFloat, + "scaled_dot_product_attention_backward_xpu: Expected logsumexp to be defined and have FP32 data type"); + const dims scalar_shape = {1}; + + at::Tensor reshaped_grad_out = grad_out_; + at::Tensor reshaped_query = query_; + at::Tensor reshaped_key = key_; + at::Tensor reshaped_value = value_; + at::Tensor reshaped_out = out_; + at::Tensor reshaped_logsumexp = logsumexp_.unsqueeze(-1); + at::Tensor reshaped_attn_mask = attn_mask_.value_or(at::Tensor()); + at::Tensor reshaped_grad_query = grad_query_; + at::Tensor 
reshaped_grad_key = grad_key_; + at::Tensor reshaped_grad_value = grad_value_; + + // handle broadcasted input tensors for OneDNN + if (at::native::onednn::is_broadcast(reshaped_grad_out)) { + at::native::onednn::undo_broadcast(reshaped_grad_out); + } + if (at::native::onednn::is_broadcast(reshaped_query)) { + at::native::onednn::undo_broadcast(reshaped_query); + } + if (at::native::onednn::is_broadcast(reshaped_key)) { + at::native::onednn::undo_broadcast(reshaped_key); + } + if (at::native::onednn::is_broadcast(reshaped_value)) { + at::native::onednn::undo_broadcast(reshaped_value); + } + if (attn_mask_.has_value() && + at::native::onednn::is_broadcast(reshaped_attn_mask)) { + at::native::onednn::undo_broadcast(reshaped_attn_mask); + } + + // TODO: Support GQA in backward pass once OneDNN supports it. + +#define LOGIC_TENSOR_DESC(name, dtype) \ + name = { \ + static_cast(TensorID::name), \ + dtype, \ + reshaped_##name.sizes().vec(), \ + reshaped_##name.strides().vec()} + + LOGIC_TENSOR_DESC(grad_out, dtype); + LOGIC_TENSOR_DESC(query, dtype); + LOGIC_TENSOR_DESC(key, dtype); + LOGIC_TENSOR_DESC(value, dtype); + LOGIC_TENSOR_DESC(out, dtype); + LOGIC_TENSOR_DESC(logsumexp, sdpa_intermediate_dtype); + scale = { + static_cast(TensorID::scale), + to_logical_tensor_data_type(at::toOpMathType(query_.scalar_type())), + scalar_shape, + logical_tensor::layout_type::strided, + logical_tensor::property_type::constant}; + if (is_causal) { + neg_inf = { + static_cast(TensorID::neg_inf), + to_logical_tensor_data_type(at::toOpMathType(query_.scalar_type())), + scalar_shape, + logical_tensor::layout_type::strided, + logical_tensor::property_type::constant}; + } + if (attn_mask_.has_value()) { + const data_type mask_dtype = + to_logical_tensor_data_type(attn_mask_->scalar_type()); + TORCH_INTERNAL_ASSERT( + (mask_dtype != data_type::undef), + "Only FP16/BF16/FP32 datatypes are currently supported for attn_mask"); + LOGIC_TENSOR_DESC(attn_mask, mask_dtype); + } + LOGIC_TENSOR_DESC(grad_query, dtype); + LOGIC_TENSOR_DESC(grad_key, dtype); + LOGIC_TENSOR_DESC(grad_value, dtype); +#undef LOGIC_TENSOR_DESC + } + std::vector get_input() const { + std::vector input = { + grad_out, query, key, value, out, logsumexp, scale}; + if (neg_inf.has_value()) { + input.push_back(neg_inf.value()); + } + if (attn_mask.has_value()) { + input.push_back(attn_mask.value()); + } + return input; + } + std::vector get_output() const { + std::vector output = {grad_query, grad_key, grad_value}; + return output; + } +}; + +partition create_sdpa_backward_graph_partition( + bool is_causal, + data_type dtype, + const SDPABackwardLogicalParams& params) { + // graph building and partitioning + size_t lt_id = static_cast(SDPABackwardLogicalParams::TensorID::end); + size_t op_id = 0; + + // OneDNN graph has optimized implementation for `f16` or `bf16` SDPA with + // `f32` intermediate data type on Intel Graphics Products with Intel(R) Xe + // Matrix Extensions (Intel(R) XMX) support, which means the + // Q/K/V tensors have bf16 or f16 data type while the output of the first + // MatMul, Scale, Mask, and the input of SoftMax are in f32 data type. 
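+  //
+  // Rough sketch of the math this graph encodes (the notation here is ours,
+  // not oneDNN's): with S = scale * (Q @ K^T) plus the additive/causal mask,
+  // P = softmax(S) = exp(S - logsumexp) recomputed from the saved logsumexp,
+  // and O = P @ V, the gradients built below are
+  //   dV = P^T @ dO
+  //   dP = dO @ V^T
+  //   dS = softmax_backward(dP, P)
+  //   dQ = scale * (dS @ K),  dK = scale * (dS^T @ Q)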
+ logical_tensor matmul_qk_out{lt_id++, sdpa_intermediate_dtype}; + op matmul_qk{ + op_id++, + op::kind::MatMul, + {params.query, params.key}, + {matmul_qk_out}, + "matmul_qk"}; + matmul_qk.set_attr(op::attr::transpose_b, true); + + logical_tensor scaled_qk_out{lt_id++, sdpa_intermediate_dtype}; + op scale_mul{ + op_id++, + op::kind::Multiply, + {matmul_qk_out, params.scale}, + {scaled_qk_out}, + "scale_mul"}; + + std::optional masked_qk_out; + + // For optional additive mask + std::optional mask_add; + + // For optional implicite causal mask + std::optional mask_gen_idx_row; + std::optional mask_row_idx; + std::optional mask_gen_idx_col; + std::optional mask_col_idx; + std::optional mask_gt; + std::optional mask_gt_out; + std::optional mask_select; + + if (params.attn_mask.has_value()) { + TORCH_INTERNAL_ASSERT( + !is_causal, "Additive mask cannot use with is_causal."); + masked_qk_out = {lt_id++, sdpa_intermediate_dtype}; + mask_add = { + op_id++, + op::kind::Add, + {scaled_qk_out, params.attn_mask.value()}, + {masked_qk_out.value()}, + "mask_add"}; + } else if (is_causal) { + mask_row_idx = {lt_id++, data_type::s32}; + mask_gen_idx_row = { + op_id++, + op::kind::GenIndex, + {scaled_qk_out}, + {mask_row_idx.value()}, + "mask_gen_idx_row"}; + mask_gen_idx_row->set_attr(op::attr::axis, -2); + + mask_col_idx = {lt_id++, data_type::s32}; + mask_gen_idx_col = { + op_id++, + op::kind::GenIndex, + {scaled_qk_out}, + {mask_col_idx.value()}, + "mask_gen_idx_col"}; + mask_gen_idx_col->set_attr(op::attr::axis, -1); + + mask_gt_out = {lt_id++, data_type::boolean}; + mask_gt = { + op_id++, + op::kind::GreaterEqual, + {mask_row_idx.value(), mask_col_idx.value()}, + {mask_gt_out.value()}, + "mask_gt"}; + + masked_qk_out = {lt_id++, sdpa_intermediate_dtype}; + mask_select = { + op_id++, + op::kind::Select, + {mask_gt_out.value(), scaled_qk_out, params.neg_inf.value()}, + {masked_qk_out.value()}, + "mask_select"}; + } + + // attention_probs = softmax(masked_score) = exp(masked_score - logsumexp) + logical_tensor sub_out{lt_id++, sdpa_intermediate_dtype}; + op subtract{ + op_id++, + op::kind::Subtract, + {masked_qk_out.value_or(scaled_qk_out), params.logsumexp}, + {sub_out}, + "subtract"}; + logical_tensor prob{lt_id++, sdpa_intermediate_dtype}; + op exp{op_id++, op::kind::Exp, {sub_out}, {prob}, "exp"}; + + // The following matmul doesn't support different input dtypes, insert a + // typecast + logical_tensor prob_casted = prob; + op typecast = op(op_id++, op::kind::TypeCast, "typecast"); + if (dtype != sdpa_intermediate_dtype) { + prob_casted = logical_tensor(lt_id++, dtype); + typecast.add_inputs({prob}); + typecast.add_outputs({prob_casted}); + } + + // grad_value = prob^T * grad_out + // TODO: handle GQA headnum because (batch_size, num_head_kv, seq_len_kv, + // head_dim_v) != (batch_size, num_head_q, seqlen_kv, seq_len_q) * + // (batch_size, num_head_q, seqlen_q, head_dim_v) + op matmul_grad_value{ + op_id++, + op::kind::MatMul, + {prob_casted, params.grad_out}, + {params.grad_value}, + "matmul_grad_value"}; + matmul_grad_value.set_attr(op::attr::transpose_a, true); + + // grad_prop = grad_out * value^T + // TODO: handle GQA headnum because (batch_size, num_head_q, seq_len_q, + // seq_len_kv) != (batch_size, num_head_q, seq_len_q, head_dim_v) * + // (batch_size, num_head_kv, head_dim_v, seq_len_kv) + logical_tensor grad_prop{lt_id++, sdpa_intermediate_dtype}; + op matmul_grad_prop{ + op_id++, + op::kind::MatMul, + {params.grad_out, params.value}, + {grad_prop}, + "matmul_grad_prop"}; + 
matmul_grad_prop.set_attr(op::attr::transpose_b, true); + + // grad_masked_score = softmaxbackward(grad_prop) + logical_tensor grad_masked_score{lt_id++, sdpa_intermediate_dtype}; + op softmax_backward{ + op_id++, + op::kind::SoftMaxBackward, + {grad_prop, prob}, + {grad_masked_score}, + "softmax_backward"}; + softmax_backward.set_attr(op::attr::axis, -1); + + // TODO: add output tensor grad_attn_mask = grad_masked_score once OneDNN + // supports output grad_attn_mask. + + // grad_scaled_score = grad_masked_score * scale + logical_tensor grad_scaled_score{lt_id++, sdpa_intermediate_dtype}; + op grad_scale_mul{ + op_id++, + op::kind::Multiply, + {grad_masked_score, params.scale}, + {grad_scaled_score}, + "grad_scale_mul"}; + + // The following matmul doesn't support different input dtypes, insert a + // typecast + logical_tensor grad_scaled_score_cast = grad_scaled_score; + op typecast2 = op(op_id++, op::kind::TypeCast, "typecast2"); + if (dtype != sdpa_intermediate_dtype) { + grad_scaled_score_cast = logical_tensor(lt_id++, dtype); + typecast2.add_inputs({grad_scaled_score}); + typecast2.add_outputs({grad_scaled_score_cast}); + } + + // grad_query = grad_scaled_score_cast * key + // TODO: handle GQA headnum because (batch_size, num_head_q, seq_len_q, + // head_dim_qk) != (batch_size, num_head_q, seq_len_q, seq_len_kv) * + // (batch_size, num_head_kv, seq_len_kv, head_dim_qk) + op matmul_grad_query{ + op_id++, + op::kind::MatMul, + {grad_scaled_score_cast, params.key}, + {params.grad_query}, + "matmul_grad_query"}; + + // grad_key = grad_scaled_score_cast^T * query + op matmul_grad_key{ + op_id++, + op::kind::MatMul, + {grad_scaled_score_cast, params.query}, + {params.grad_key}, + "matmul_grad_key"}; + matmul_grad_key.set_attr(op::attr::transpose_a, true); + + constexpr auto ekind = dnnl::engine::kind::gpu; + dnnl::graph::graph g(ekind); + g.add_op(matmul_qk); + g.add_op(scale_mul); + if (mask_add.has_value()) { + g.add_op(mask_add.value()); + } + if (is_causal) { + g.add_op(mask_gen_idx_row.value()); + g.add_op(mask_gen_idx_col.value()); + g.add_op(mask_gt.value()); + g.add_op(mask_select.value()); + } + g.add_op(subtract); + g.add_op(exp); + g.add_op(matmul_grad_value); + g.add_op(matmul_grad_prop); + g.add_op(softmax_backward); + g.add_op(grad_scale_mul); + g.add_op(matmul_grad_query); + g.add_op(matmul_grad_key); + if (dtype != sdpa_intermediate_dtype) { + g.add_op(typecast); + g.add_op(typecast2); + } + g.finalize(); + auto partitions = g.get_partitions(); + TORCH_INTERNAL_ASSERT( + (partitions.size() == 1) && partitions[0].is_supported(), + "oneDNN doesn't support this fusion pattern. If you'd like its support, please submit a issue."); + return partitions[0]; +} + +partition& find_or_create_backward_graph_partition( + bool is_causal, + const SDPABackwardLogicalParams& params) { + thread_local PartitionCache cache; + const data_type dtype = params.query.get_data_type(); + + // cache key creation + // patternID is determined on the basis of the arguments provided + std::bitset<32> patternID; + if (dtype == data_type::f32) { + patternID.set(static_cast(PartitionCache::BitType::Float32), 1); + } + if (dtype == data_type::bf16) { + patternID.set(static_cast(PartitionCache::BitType::Bfloat16), 1); + } + // sdpa backward pattern + patternID.set( + static_cast(PartitionCache::BitType::SdpaBwdPattern), 1); + + // Refer to comments in Utils.h. 
The first 8 bits are reserved + int pos = 8; + // attn_mask + patternID.set(pos++, params.attn_mask.has_value()); + patternID.set(pos++, is_causal); + + auto partition_ = cache.find_partition(patternID); + if (!partition_.has_value()) { + // partition cache no hit + // graph building and partitioning + partition sdpa_backward_partition = + create_sdpa_backward_graph_partition(is_causal, dtype, params); + partition_ = + cache.insert_partition_cache(patternID, sdpa_backward_partition); + } + return *partition_; +} +} // namespace sdpa_backward } // namespace namespace at::native::onednn { -void gpu_float_sdpa( +void sdpa( int batch_size, int seq_len_q, int seq_len_kv, @@ -355,7 +796,9 @@ void gpu_float_sdpa( std::optional attn_mask, bool is_causal, float softmax_scale, - const Tensor& output) { + const Tensor& attention, + bool compute_logsumexp, + const Tensor& logsumexp) { auto& eng = GpuEngineManager::Instance().get_engine(); auto& strm = GpuStreamManager::Instance().get_stream(); @@ -370,8 +813,8 @@ void gpu_float_sdpa( }; // OneDNN doesn't support fp32 ukernel for implicit causal mask, - // and the reference implementation is worse than aten math + explict causal - // mask. Fall back to explict causal mask until OneDNN v3.9 which has fp32 + // and the reference implementation is worse than aten math + explicit causal + // mask. Fall back to explicit causal mask until OneDNN v3.9 which has fp32 // ukernel for implicit causal mask. if (is_causal && query.dtype() == at::kFloat) { attn_mask = get_tril_mask(); @@ -381,32 +824,27 @@ void gpu_float_sdpa( std::vector l_inputs, l_outputs; std::optional compiled_partition; - auto get_compiled_partition = [&]() { - const SDPALogicalParams logical_params( - query, - key, - value, - attn_mask, - output, - batch_size, - seq_len_q, - seq_len_kv, - num_head_q, - num_head_kv, - head_dim_qk, - head_dim_v, - is_causal); - auto& partition_ = - find_or_create_graph_partition(is_causal, logical_params); - auto i = logical_params.get_input(); - auto o = logical_params.get_output(); - auto compiled_partition = partition_.compile(i, o, eng); - l_inputs = std::move(i); - l_outputs = std::move(o); - return compiled_partition; - }; - - compiled_partition = get_compiled_partition(); + const sdpa_forward::SDPALogicalParams logical_params( + query, + key, + value, + attn_mask, + attention, + logsumexp, + batch_size, + seq_len_q, + seq_len_kv, + num_head_q, + num_head_kv, + head_dim_qk, + head_dim_v, + is_causal, + compute_logsumexp); + auto& partition = sdpa_forward::find_or_create_graph_partition( + is_causal, compute_logsumexp, logical_params); + l_inputs = std::move(logical_params.get_input()); + l_outputs = std::move(logical_params.get_output()); + compiled_partition = partition.compile(l_inputs, l_outputs, eng); Tensor softmax_scale1 = at::full( {}, @@ -416,26 +854,147 @@ void gpu_float_sdpa( if (is_causal) { neg_inf = at::full( {}, - -INFINITY, + -std::numeric_limits::infinity(), query.options().dtype(at::toOpMathType(query.scalar_type()))); } std::vector outputs = { - {l_outputs[0], eng, output.data_ptr()}, + {l_outputs[0], eng, attention.data_ptr()}, }; + if (compute_logsumexp) { + outputs.emplace_back(l_outputs[1], eng, logsumexp.data_ptr()); + } + size_t i = 0; std::vector inputs; inputs.reserve(l_inputs.size()); - inputs.emplace_back(l_inputs[i++], eng, query.data_ptr()); - inputs.emplace_back(l_inputs[i++], eng, key.data_ptr()); - inputs.emplace_back(l_inputs[i++], eng, softmax_scale1.data_ptr()); + +#define ADD_INPUT(variable) \ + 
inputs.emplace_back(l_inputs[i++], eng, variable.data_ptr()) + + ADD_INPUT(query); + ADD_INPUT(key); + ADD_INPUT(softmax_scale1); if (neg_inf.has_value()) { - inputs.emplace_back(l_inputs[i++], eng, neg_inf->data_ptr()); + ADD_INPUT((*neg_inf)); } if (attn_mask.has_value()) { - inputs.emplace_back(l_inputs[i++], eng, attn_mask->data_ptr()); + ADD_INPUT((*attn_mask)); } - inputs.emplace_back(l_inputs[i++], eng, value.data_ptr()); + ADD_INPUT(value); +#undef ADD_INPUT + + compiled_partition->execute(strm, inputs, outputs); +} + +void sdpa_backward( + int batch_size, + int num_head_q, + int num_head_kv, + int seq_len_q, + int seq_len_kv, + int head_dim_qk, + int head_dim_v, + const Tensor& grad_out, + const Tensor& query, + const Tensor& key, + const Tensor& value, + const Tensor& out, + const Tensor& logsumexp, + std::optional attn_mask, + bool is_causal, + double scale, + Tensor& grad_query, + Tensor& grad_key, + Tensor& grad_value) { + auto& eng = GpuEngineManager::Instance().get_engine(); + auto& strm = GpuStreamManager::Instance().get_stream(); + + const auto get_tril_mask = [&]() { + auto opts = query.options(); + auto bool_tril = + at::ones_symint({seq_len_q, seq_len_kv}, opts.dtype(at::kBool)).tril(); + return at::where( + bool_tril, + 0.f, + at::scalar_tensor(-std::numeric_limits::infinity(), opts)); + }; + + // OneDNN doesn't support fp32 ukernel for implicit causal mask, + // and the reference implementation is worse than aten math + explicit causal + // mask. Fall back to explicit causal mask until OneDNN v3.9 which has fp32 + // ukernel for implicit causal mask. + if (is_causal && query.dtype() == at::kFloat) { + attn_mask = get_tril_mask(); + is_causal = false; + } + + std::vector l_inputs, l_outputs; + std::optional compiled_partition; + + const sdpa_backward::SDPABackwardLogicalParams logical_params( + grad_out, + query, + key, + value, + out, + logsumexp, + attn_mask, + grad_query, + grad_key, + grad_value, + batch_size, + num_head_q, + num_head_kv, + seq_len_q, + seq_len_kv, + head_dim_qk, + head_dim_v, + is_causal); + auto& partition = sdpa_backward::find_or_create_backward_graph_partition( + is_causal, logical_params); + l_inputs = std::move(logical_params.get_input()); + l_outputs = std::move(logical_params.get_output()); + compiled_partition = partition.compile(l_inputs, l_outputs, eng); + + Tensor softmax_scale = at::full( + {}, scale, query.options().dtype(at::toOpMathType(query.scalar_type()))); + std::optional neg_inf; + if (is_causal) { + neg_inf = at::full( + {}, + -std::numeric_limits::infinity(), + query.options().dtype(at::toOpMathType(query.scalar_type()))); + } + + std::vector outputs = { + {l_outputs[0], eng, grad_query.data_ptr()}, + {l_outputs[1], eng, grad_key.data_ptr()}, + {l_outputs[2], eng, grad_value.data_ptr()}, + }; + + size_t i = 0; + std::vector inputs; + inputs.reserve(l_inputs.size()); + +#define ADD_INPUT(variable) \ + inputs.emplace_back(l_inputs[i++], eng, variable.data_ptr()) + + ADD_INPUT(grad_out); + ADD_INPUT(query); + ADD_INPUT(key); + ADD_INPUT(value); + ADD_INPUT(out); + ADD_INPUT(logsumexp); + ADD_INPUT(softmax_scale); + if (neg_inf.has_value()) { + ADD_INPUT((*neg_inf)); + } + if (attn_mask.has_value()) { + ADD_INPUT((*attn_mask)); + } +#undef ADD_INPUT + compiled_partition->execute(strm, inputs, outputs); } } // namespace at::native::onednn diff --git a/aten/src/ATen/native/mkldnn/xpu/detail/Utils.h b/aten/src/ATen/native/mkldnn/xpu/detail/Utils.h index ac8645d3e4a5..52f89bc1395d 100644 --- 
a/aten/src/ATen/native/mkldnn/xpu/detail/Utils.h +++ b/aten/src/ATen/native/mkldnn/xpu/detail/Utils.h @@ -110,11 +110,21 @@ struct PartitionCache { // bit 1: is uint8 // bit 2: fp16(0) / bf16(1) // bit 3: is fp32 - // bit 4: is sdp pattern - // bit 5-7: N/A + // bit 4: is sdpa pattern + // bit 5: is sdpa backward pattern + // bit 6-7: reserved for future use // The rest of the bits depend upon the arguments provided // However, down the line, we might have different bitsets for different // patterns + enum class BitType : uint8_t { + Int8 = 0, + Uint8 = 1, + Bfloat16 = 2, + Float32 = 3, + SdpaPattern = 4, + SdpaBwdPattern = 5 + }; + dnnl::graph::partition& insert_partition_cache( std::bitset<32>& patternID, dnnl::graph::partition& p) { diff --git a/aten/src/ATen/native/mkldnn/xpu/detail/oneDNN.h b/aten/src/ATen/native/mkldnn/xpu/detail/oneDNN.h index e73cb73e8b1e..6b2bf01e6d73 100644 --- a/aten/src/ATen/native/mkldnn/xpu/detail/oneDNN.h +++ b/aten/src/ATen/native/mkldnn/xpu/detail/oneDNN.h @@ -164,7 +164,7 @@ void quantized_matmul( std::string_view unary_post_op_algorithm, bool m2_trnas); -void gpu_float_sdpa( +void sdpa( int batch_size, int seq_len_q, int seq_len_kv, @@ -178,5 +178,28 @@ void gpu_float_sdpa( std::optional attn_mask, bool is_causal, float softmax_scale, - const Tensor& output); + const Tensor& attention, + bool compute_logsumexp, + const Tensor& logsumexp); + +void sdpa_backward( + int batch_size, + int num_head_q, + int num_head_kv, + int seq_len_q, + int seq_len_kv, + int head_dim_qk, + int head_dim_v, + const Tensor& grad_out, + const Tensor& query, + const Tensor& key, + const Tensor& value, + const Tensor& out, + const Tensor& logsumexp, + std::optional attn_mask, + bool is_causal, + double scale, + Tensor& grad_query, + Tensor& grad_key, + Tensor& grad_value); } // namespace at::native::onednn diff --git a/aten/src/ATen/native/mkldnn/xpu/qconv.cpp b/aten/src/ATen/native/mkldnn/xpu/qconv.cpp index 1c6e2a6c89da..c014313a5b35 100644 --- a/aten/src/ATen/native/mkldnn/xpu/qconv.cpp +++ b/aten/src/ATen/native/mkldnn/xpu/qconv.cpp @@ -1,5 +1,7 @@ #include #include +#include + #include #include #include @@ -7,7 +9,7 @@ using namespace at::native::onednn; namespace at::native::xpu { -static inline c10::ScalarType qconv_decide_out_dtype( +inline c10::ScalarType QConvoneDNNXPU::qconv_decide_out_dtype( const at::Tensor& act, const std::optional output_dtype) { bool fp32_output = output_dtype.has_value() && (output_dtype == c10::kFloat); @@ -19,7 +21,7 @@ static inline c10::ScalarType qconv_decide_out_dtype( return dst_dtype; } -static at::Tensor qconv_prepack_xpu( +at::Tensor QConvoneDNNXPU::qconv_prepack_xpu( at::Tensor weight, at::Tensor weight_scales, double input_scale, @@ -33,222 +35,265 @@ static at::Tensor qconv_prepack_xpu( return weight; } -class QConvoneDNNXPU final { - public: - static at::Tensor run_pointwise( - at::Tensor act, - double act_scale, - int64_t act_zero_point, - at::Tensor weight, - at::Tensor weight_scales, - at::Tensor weight_zero_points, - std::optional bias, - torch::List stride, - torch::List padding, - torch::List dilation, - int64_t groups, - double inv_output_scale, - int64_t output_zero_point, - std::optional output_dtype, - std::string_view attr, - torch::List> scalars, - std::optional algorithm) { - if (act.dim() == 3 || act.dim() == 5) { - TORCH_CHECK( - attr == "none", - "quantized pointwise conv", - act.dim() - 2, - "d doesn't support unary_post_op fusion. 
Got unary_post_op:", - attr, - "."); - } else { - TORCH_CHECK( - attr == "none" || attr == "relu" || attr == "hardtanh" || - attr == "hardswish" || attr == "swish", - "We support quantized convolution without any post-ops or combinations for Quantized Conv + ReLU, Hardtanh, GELU, Swish, and Hardswish are supported. However, encountered unsupported post operation:", - attr, - "."); - } +at::Tensor QConvoneDNNXPU::run_pointwise( + at::Tensor act, + double act_scale, + int64_t act_zero_point, + at::Tensor weight, + at::Tensor weight_scales, + at::Tensor weight_zero_points, + std::optional bias, + torch::List stride, + torch::List padding, + torch::List dilation, + int64_t groups, + double inv_output_scale, + int64_t output_zero_point, + std::optional output_dtype, + std::string_view attr, + torch::List> scalars, + std::optional algorithm) { + if (act.dim() == 3 || act.dim() == 5) { + TORCH_CHECK( + attr == "none", + "quantized pointwise conv", + act.dim() - 2, + "d doesn't support unary_post_op fusion. Got unary_post_op:", + attr, + "."); + } else { + TORCH_CHECK( + attr == "none" || attr == "relu" || attr == "hardtanh" || + attr == "hardswish" || attr == "swish", + "We support quantized convolution without any post-ops or combinations for Quantized Conv + ReLU, Hardtanh, GELU, Swish, and Hardswish are supported. However, encountered unsupported post operation:", + attr, + "."); + } - bool is_channels_last_suggested = use_channels_last_for_conv(act, weight); - auto mfmt = is_channels_last_suggested - ? get_cl_tag_by_ndim(act.ndimension()) - : at::MemoryFormat::Contiguous; - Tensor input_ = act.contiguous(mfmt); - Tensor weight_ = weight.contiguous(mfmt); + bool is_channels_last_suggested = use_channels_last_for_conv(act, weight); + auto mfmt = is_channels_last_suggested ? 
get_cl_tag_by_ndim(act.ndimension()) + : at::MemoryFormat::Contiguous; + Tensor input_ = act.contiguous(mfmt); + Tensor weight_ = weight.contiguous(mfmt); - auto dst_tz = conv_dst_size( - input_.ndimension(), - input_.sizes(), - weight_.sizes(), - padding.vec(), - padding.vec(), - stride.vec(), - dilation.vec()); + auto dst_tz = conv_dst_size( + input_.ndimension(), + input_.sizes(), + weight_.sizes(), + padding.vec(), + padding.vec(), + stride.vec(), + dilation.vec()); - auto dst_dtype = qconv_decide_out_dtype(act, output_dtype); - Tensor output = - at::empty(dst_tz, act.options().dtype(dst_dtype).memory_format(mfmt)); + auto dst_dtype = qconv_decide_out_dtype(act, output_dtype); + Tensor output = + at::empty(dst_tz, act.options().dtype(dst_dtype).memory_format(mfmt)); - return quantized_convolution( - act, - act_scale, - act_zero_point, - weight, - weight_scales, - weight_zero_points, - bias, - stride, - padding, - dilation, - /*transposed*/ false, - groups, - output, - inv_output_scale, - output_zero_point, - /*accum*/ std::nullopt, - /*accum_scale*/ 0.0, - /*accum_zero_point*/ 0, - /*output_dtype*/ output_dtype, - /*binary_attr*/ std::nullopt, - /*binary_alpha*/ std::nullopt, - /*unary_attr*/ attr, - /*unary_scalars*/ scalars, - /*unary_algorithm*/ algorithm); - } + return quantized_convolution( + act, + act_scale, + act_zero_point, + weight, + weight_scales, + weight_zero_points, + bias, + stride, + padding, + dilation, + /*transposed*/ false, + groups, + output, + inv_output_scale, + output_zero_point, + /*accum*/ std::nullopt, + /*accum_scale*/ 0.0, + /*accum_zero_point*/ 0, + /*output_dtype*/ output_dtype, + /*binary_attr*/ std::nullopt, + /*binary_alpha*/ std::nullopt, + /*unary_attr*/ attr, + /*unary_scalars*/ scalars, + /*unary_algorithm*/ algorithm); +} - static at::Tensor run_pointwise_tensor( - at::Tensor act, - at::Tensor act_scale, - at::Tensor act_zero_point, - at::Tensor weight, - at::Tensor weight_scales, - at::Tensor weight_zero_points, - std::optional bias, - torch::List stride, - torch::List padding, - torch::List dilation, - int64_t groups, - double output_scale, - int64_t output_zero_point, - std::optional output_dtype, - std::string_view attr, - torch::List> scalars, - std::optional algorithm) { - return run_pointwise( - act, - act_scale.item().toDouble(), - act_zero_point.item().toLong(), - weight, - weight_scales, - weight_zero_points, - bias, - stride, - padding, - dilation, - groups, - output_scale, - output_zero_point, - output_dtype, - /*unary_attr*/ attr, - /*unary_scalars*/ scalars, - /*unary_algorithm*/ algorithm); - } +at::Tensor QConvoneDNNXPU::run_pointwise_tensor( + at::Tensor act, + at::Tensor act_scale, + at::Tensor act_zero_point, + at::Tensor weight, + at::Tensor weight_scales, + at::Tensor weight_zero_points, + std::optional bias, + torch::List stride, + torch::List padding, + torch::List dilation, + int64_t groups, + double output_scale, + int64_t output_zero_point, + std::optional output_dtype, + std::string_view attr, + torch::List> scalars, + std::optional algorithm) { + return run_pointwise( + act, + act_scale.item().toDouble(), + act_zero_point.item().toLong(), + weight, + weight_scales, + weight_zero_points, + bias, + stride, + padding, + dilation, + groups, + output_scale, + output_zero_point, + output_dtype, + /*unary_attr*/ attr, + /*unary_scalars*/ scalars, + /*unary_algorithm*/ algorithm); +} - static at::Tensor run_pointwise_binary( - at::Tensor act, - double act_scale, - int64_t act_zero_point, - at::Tensor weight, - at::Tensor 
weight_scales, - at::Tensor weight_zero_points, - at::Tensor accum, - std::optional bias, - torch::List stride, - torch::List padding, - torch::List dilation, - int64_t groups, - double output_scale, - int64_t output_zero_point, - std::optional output_dtype, - double accum_scale, - int64_t accum_zero_point, - std::string_view binary_attr, - std::optional alpha, - std::optional unary_attr, - torch::List> unary_scalars, - std::optional unary_algorithm) { - TORCH_CHECK( - act.dim() == 4 && binary_attr == "sum" && - (!unary_attr.has_value() || - (unary_attr.has_value() && - (unary_attr.value() == "none" || unary_attr.value() == "relu"))), - "post_op sum or post_op sum_relu is supported for quantized pointwise conv2d. Got binary_post_op: ", - binary_attr, - " unary_post_op: ", - unary_attr.has_value() ? unary_attr.value() : "none", - ".") +at::Tensor QConvoneDNNXPU::run_pointwise_binary( + at::Tensor act, + double act_scale, + int64_t act_zero_point, + at::Tensor weight, + at::Tensor weight_scales, + at::Tensor weight_zero_points, + at::Tensor accum, + std::optional bias, + torch::List stride, + torch::List padding, + torch::List dilation, + int64_t groups, + double output_scale, + int64_t output_zero_point, + std::optional output_dtype, + double accum_scale, + int64_t accum_zero_point, + std::string_view binary_attr, + std::optional alpha, + std::optional unary_attr, + torch::List> unary_scalars, + std::optional unary_algorithm) { + TORCH_CHECK( + act.dim() == 4 && binary_attr == "sum" && + (!unary_attr.has_value() || + (unary_attr.has_value() && + (unary_attr.value() == "none" || unary_attr.value() == "relu"))), + "post_op sum or post_op sum_relu is supported for quantized pointwise conv2d. Got binary_post_op: ", + binary_attr, + " unary_post_op: ", + unary_attr.has_value() ? unary_attr.value() : "none", + ".") - bool is_channels_last_suggested = use_channels_last_for_conv(act, weight); - auto mfmt = is_channels_last_suggested - ? get_cl_tag_by_ndim(act.ndimension()) - : at::MemoryFormat::Contiguous; - Tensor input_ = act.contiguous(mfmt); - Tensor weight_ = weight.contiguous(mfmt); + bool is_channels_last_suggested = use_channels_last_for_conv(act, weight); + auto mfmt = is_channels_last_suggested ? get_cl_tag_by_ndim(act.ndimension()) + : at::MemoryFormat::Contiguous; + Tensor input_ = act.contiguous(mfmt); + Tensor weight_ = weight.contiguous(mfmt); - auto dst_tz = conv_dst_size( - input_.ndimension(), - input_.sizes(), - weight_.sizes(), - padding.vec(), - padding.vec(), - stride.vec(), - dilation.vec()); + auto dst_tz = conv_dst_size( + input_.ndimension(), + input_.sizes(), + weight_.sizes(), + padding.vec(), + padding.vec(), + stride.vec(), + dilation.vec()); - auto dst_dtype = qconv_decide_out_dtype(act, output_dtype); - bool has_accum_postop_sum = binary_attr == "sum"; - Tensor output = has_accum_postop_sum - ? accum - : at::empty(dst_tz, act.options().dtype(dst_dtype).memory_format(mfmt)); + auto dst_dtype = qconv_decide_out_dtype(act, output_dtype); + bool has_accum_postop_sum = binary_attr == "sum"; + Tensor output = has_accum_postop_sum + ? 
accum + : at::empty(dst_tz, act.options().dtype(dst_dtype).memory_format(mfmt)); - output = quantized_convolution( - act, - act_scale, - act_zero_point, - weight, - weight_scales, - weight_zero_points, - bias, - stride, - padding, - dilation, - /*transposed*/ false, - groups, - output, - output_scale, - output_zero_point, - /*accum*/ accum, - /*accum_scale*/ accum_scale, - /*accum_zero_point*/ accum_zero_point, - /*output_dtype*/ output_dtype, - /*binary_attr*/ binary_attr, - /*binary_alpha*/ alpha, - /*unary_attr*/ unary_attr, - /*unary_scalars*/ unary_scalars, - /*unary_algorithm*/ unary_algorithm); + output = quantized_convolution( + act, + act_scale, + act_zero_point, + weight, + weight_scales, + weight_zero_points, + bias, + stride, + padding, + dilation, + /*transposed*/ false, + groups, + output, + output_scale, + output_zero_point, + /*accum*/ accum, + /*accum_scale*/ accum_scale, + /*accum_zero_point*/ accum_zero_point, + /*output_dtype*/ output_dtype, + /*binary_attr*/ binary_attr, + /*binary_alpha*/ alpha, + /*unary_attr*/ unary_attr, + /*unary_scalars*/ unary_scalars, + /*unary_algorithm*/ unary_algorithm); - if (!has_accum_postop_sum) { - return output; - } else { - return accum; - } + if (!has_accum_postop_sum) { + return output; + } else { + return accum; } -}; +} + +at::Tensor QConvoneDNNXPU::run_pointwise_binary_tensor( + at::Tensor act, // contains quantized values but not QTensor + at::Tensor act_scale, + at::Tensor act_zero_point, + at::Tensor weight, // contains quantized values but not QTensor + at::Tensor weight_scales, + at::Tensor weight_zero_points, + at::Tensor accum, // contains quantized values but not QTensor + std::optional bias, + torch::List stride, + torch::List padding, + torch::List dilation, + int64_t groups, + double output_scale, + int64_t output_zero_point, + std::optional output_dtype, + double accum_scale, + int64_t accum_zero_point, + std::string_view binary_attr, + std::optional alpha, + std::optional unary_attr, + torch::List> unary_scalars, + std::optional unary_algorithm) { + return run_pointwise_binary( + act, + act_scale.item().toDouble(), + act_zero_point.item().toLong(), + weight, + weight_scales, + weight_zero_points, + accum, + bias, + stride, + padding, + dilation, + groups, + output_scale, + output_zero_point, + output_dtype, + accum_scale, + accum_zero_point, + binary_attr, + alpha, + unary_attr, + unary_scalars, + unary_algorithm); +} TORCH_LIBRARY_IMPL(onednn, XPU, m) { m.impl( TORCH_SELECTIVE_NAME("onednn::qconv_prepack"), - TORCH_FN(xpu::qconv_prepack_xpu)); + TORCH_FN(QConvoneDNNXPU::qconv_prepack_xpu)); m.impl( TORCH_SELECTIVE_NAME("onednn::qconv1d_pointwise"), QConvoneDNNXPU::run_pointwise); @@ -267,6 +312,9 @@ TORCH_LIBRARY_IMPL(onednn, XPU, m) { m.impl( TORCH_SELECTIVE_NAME("onednn::qconv_pointwise.tensor"), QConvoneDNNXPU::run_pointwise_tensor); + m.impl( + TORCH_SELECTIVE_NAME("onednn::qconv2d_pointwise.binary_tensor"), + QConvoneDNNXPU::run_pointwise_binary_tensor); } } // namespace at::native::xpu diff --git a/aten/src/ATen/native/mkldnn/xpu/qconv.h b/aten/src/ATen/native/mkldnn/xpu/qconv.h new file mode 100644 index 000000000000..e9ddd4fa2969 --- /dev/null +++ b/aten/src/ATen/native/mkldnn/xpu/qconv.h @@ -0,0 +1,111 @@ +#pragma once + +#include +#include + +namespace at::native::xpu { +class QConvoneDNNXPU final { + public: + C10_API static at::Tensor run_pointwise( + at::Tensor act, + double act_scale, + int64_t act_zero_point, + at::Tensor weight, + at::Tensor weight_scales, + at::Tensor weight_zero_points, + 
std::optional bias, + torch::List stride, + torch::List padding, + torch::List dilation, + int64_t groups, + double inv_output_scale, + int64_t output_zero_point, + std::optional output_dtype, + std::string_view attr, + torch::List> scalars, + std::optional algorithm); + + C10_API static at::Tensor run_pointwise_tensor( + at::Tensor act, + at::Tensor act_scale, + at::Tensor act_zero_point, + at::Tensor weight, + at::Tensor weight_scales, + at::Tensor weight_zero_points, + std::optional bias, + torch::List stride, + torch::List padding, + torch::List dilation, + int64_t groups, + double output_scale, + int64_t output_zero_point, + std::optional output_dtype, + std::string_view attr, + torch::List> scalars, + std::optional algorithm); + + C10_API static at::Tensor run_pointwise_binary( + at::Tensor act, + double act_scale, + int64_t act_zero_point, + at::Tensor weight, + at::Tensor weight_scales, + at::Tensor weight_zero_points, + at::Tensor accum, + std::optional bias, + torch::List stride, + torch::List padding, + torch::List dilation, + int64_t groups, + double output_scale, + int64_t output_zero_point, + std::optional output_dtype, + double accum_scale, + int64_t accum_zero_point, + std::string_view binary_attr, + std::optional alpha, + std::optional unary_attr, + torch::List> unary_scalars, + std::optional unary_algorithm); + + C10_API static at::Tensor run_pointwise_binary_tensor( + at::Tensor act, + at::Tensor act_scale, + at::Tensor act_zero_point, + at::Tensor weight, + at::Tensor weight_scales, + at::Tensor weight_zero_points, + at::Tensor accum, + std::optional bias, + torch::List stride, + torch::List padding, + torch::List dilation, + int64_t groups, + double output_scale, + int64_t output_zero_point, + std::optional output_dtype, + double accum_scale, + int64_t accum_zero_point, + std::string_view binary_attr, + std::optional alpha, + std::optional unary_attr, + torch::List> unary_scalars, + std::optional unary_algorithm); + + static inline c10::ScalarType qconv_decide_out_dtype( + const at::Tensor& act, + const std::optional output_dtype); + + static at::Tensor qconv_prepack_xpu( + at::Tensor weight, + at::Tensor weight_scales, + double input_scale, + int64_t input_zero_point, + torch::List stride, + torch::List padding, + torch::List dilation, + int64_t groups, + std::optional> input_shape); +}; + +} // namespace at::native::xpu \ No newline at end of file diff --git a/aten/src/ATen/native/mkldnn/xpu/qlinear.cpp b/aten/src/ATen/native/mkldnn/xpu/qlinear.cpp index 7e3f2f01fa1e..e9584e8289eb 100644 --- a/aten/src/ATen/native/mkldnn/xpu/qlinear.cpp +++ b/aten/src/ATen/native/mkldnn/xpu/qlinear.cpp @@ -1,13 +1,14 @@ #include #include +#include #include using namespace at::native::onednn; namespace at::native::xpu { -static inline c10::ScalarType qlinear_decide_out_dtype( +inline c10::ScalarType QLinearOnednnXPU::qlinear_decide_out_dtype( const at::Tensor& act, const std::optional output_dtype) { bool fp32_output = output_dtype.has_value() && (output_dtype == c10::kFloat); @@ -19,7 +20,7 @@ static inline c10::ScalarType qlinear_decide_out_dtype( return dst_dtype; } -static Tensor q_linear_pointwise( +Tensor QLinearOnednnXPU::q_linear_pointwise( Tensor act, double act_scale, int64_t act_zero_point, @@ -78,7 +79,7 @@ static Tensor q_linear_pointwise( return qout; } -static Tensor q_linear_pointwise_tensor( +Tensor QLinearOnednnXPU::q_linear_pointwise_tensor( Tensor act, Tensor act_scale, Tensor act_zero_point, @@ -137,7 +138,7 @@ static Tensor q_linear_pointwise_tensor( return qout; 
} -static Tensor q_linear_pointwise_binary( +Tensor QLinearOnednnXPU::q_linear_pointwise_binary( Tensor act, double act_scale, int64_t act_zero_point, @@ -208,7 +209,7 @@ static Tensor q_linear_pointwise_binary( return dim == 3 ? qout.reshape({act.size(0), -1, N}) : qout; } -static Tensor q_linear_pointwise_binary_tensor( +Tensor QLinearOnednnXPU::q_linear_pointwise_binary_tensor( Tensor act, Tensor act_scale, Tensor act_zero_point, @@ -248,7 +249,7 @@ static Tensor q_linear_pointwise_binary_tensor( unary_post_op_algorithm); } -static at::Tensor q_linear_prepack_onednn( +Tensor QLinearOnednnXPU::q_linear_prepack_onednn( at::Tensor weight, std::optional> input_shape) { at::Tensor weight_transposed = weight.transpose(0, 1); @@ -258,19 +259,19 @@ static at::Tensor q_linear_prepack_onednn( TORCH_LIBRARY_IMPL(onednn, XPU, m) { m.impl( TORCH_SELECTIVE_NAME("onednn::qlinear_pointwise"), - TORCH_FN(q_linear_pointwise)); + TORCH_FN(QLinearOnednnXPU::q_linear_pointwise)); m.impl( TORCH_SELECTIVE_NAME("onednn::qlinear_pointwise.tensor"), - TORCH_FN(q_linear_pointwise_tensor)); + TORCH_FN(QLinearOnednnXPU::q_linear_pointwise_tensor)); m.impl( TORCH_SELECTIVE_NAME("onednn::qlinear_prepack"), - TORCH_FN(q_linear_prepack_onednn)); + TORCH_FN(QLinearOnednnXPU::q_linear_prepack_onednn)); m.impl( TORCH_SELECTIVE_NAME("onednn::qlinear_pointwise.binary"), - TORCH_FN(q_linear_pointwise_binary)); + TORCH_FN(QLinearOnednnXPU::q_linear_pointwise_binary)); m.impl( TORCH_SELECTIVE_NAME("onednn::qlinear_pointwise.binary_tensor"), - TORCH_FN(q_linear_pointwise_binary_tensor)); + TORCH_FN(QLinearOnednnXPU::q_linear_pointwise_binary_tensor)); } } // namespace at::native::xpu diff --git a/aten/src/ATen/native/mkldnn/xpu/qlinear.h b/aten/src/ATen/native/mkldnn/xpu/qlinear.h new file mode 100644 index 000000000000..738227666424 --- /dev/null +++ b/aten/src/ATen/native/mkldnn/xpu/qlinear.h @@ -0,0 +1,91 @@ +#pragma once + +#include +#include +#include + +namespace at::native::xpu { + +class QLinearOnednnXPU final { + public: + C10_API static Tensor q_linear_pointwise( + Tensor act, + double act_scale, + int64_t act_zero_point, + Tensor weight, + Tensor weight_scales, + Tensor weight_zero_points, + std::optional bias, + double output_scale, + int64_t output_zero_point, + std::optional output_dtype, + std::string_view post_op_name, + torch::List> post_op_args, + std::string_view post_op_algorithm); + + C10_API static Tensor q_linear_pointwise_tensor( + Tensor act, + Tensor act_scale, + Tensor act_zero_point, + Tensor weight, + Tensor weight_scales, + Tensor weight_zero_points, + std::optional bias, + double output_scale, + int64_t output_zero_point, + std::optional output_dtype, + std::string_view post_op_name, + torch::List> post_op_args, + std::string_view post_op_algorithm); + + C10_API static Tensor q_linear_pointwise_binary( + Tensor act, + double act_scale, + int64_t act_zero_point, + Tensor weight, + Tensor weight_scales, + Tensor weight_zero_points, + std::optional other, + std::optional bias, + double output_scale, + int64_t output_zero_point, + std::optional output_dtype, + double other_scale, + int64_t other_zero_point, + std::string_view binary_post_op, + double binary_alpha, + std::string_view unary_post_op, + torch::List> unary_post_op_args, + std::string_view unary_post_op_algorithm); + + C10_API static Tensor q_linear_pointwise_binary_tensor( + Tensor act, + Tensor act_scale, + Tensor act_zero_point, + Tensor weight, + Tensor weight_scales, + Tensor weight_zero_points, + std::optional other, + std::optional 
bias, + double output_scale, + int64_t output_zero_point, + std::optional output_dtype, + double other_scale, + int64_t other_zero_point, + std::string_view binary_post_op, + double binary_alpha, + std::string_view unary_post_op, + torch::List> unary_post_op_args, + std::string_view unary_post_op_algorithm); + + C10_API static Tensor q_linear_prepack_onednn( + at::Tensor weight, + std::optional> input_shape); + + static inline c10::ScalarType qlinear_decide_out_dtype( + const at::Tensor& act, + const std::optional output_dtype); + +}; // class QLinearOnednnXPU + +} // namespace at::native::xpu diff --git a/aten/src/ATen/native/mps/kernels/Attention.metal b/aten/src/ATen/native/mps/kernels/Attention.metal index 6bb2cbfb3d71..5a317895f508 100644 --- a/aten/src/ATen/native/mps/kernels/Attention.metal +++ b/aten/src/ATen/native/mps/kernels/Attention.metal @@ -14,8 +14,8 @@ template device T* out [[buffer(3)]], const constant uint& gqa_factor [[buffer(4)]], const constant uint& N [[buffer(5)]], - const constant uint2& k_head_seq_stride [[buffer(6)]], - const constant uint2& v_head_seq_stride [[buffer(7)]], + const constant uint3& qkv_head_strides [[buffer(6)]], + const constant uint3& qkv_seq_strides [[buffer(7)]], const constant float& scale [[buffer(8)]], const device bool* mask [[buffer(9)]], const constant uint3& mask_strides [[buffer(10)]], @@ -28,10 +28,12 @@ template constexpr uint BD = 32; constexpr uint qk_per_thread = D / BD; constexpr uint v_per_thread = V / BD; - const uint k_head_stride = k_head_seq_stride.x; - const uint k_seq_stride = k_head_seq_stride.y; - const uint v_head_stride = v_head_seq_stride.x; - const uint v_seq_stride = v_head_seq_stride.y; + const uint q_head_stride = qkv_head_strides.x; + const uint q_seq_stride = qkv_seq_strides.x; + const uint k_head_stride = qkv_head_strides.y; + const uint k_seq_stride = qkv_seq_strides.y; + const uint v_head_stride = qkv_head_strides.z; + const uint v_seq_stride = qkv_seq_strides.z; const uint mask_head_stride = mask_strides.x; const uint mask_kv_seq_stride = mask_strides.y; const uint mask_q_seq_stride = mask_strides.z; @@ -54,9 +56,9 @@ template const int kv_head_idx = head_idx / gqa_factor; const int Q = tpg.y; const int group_offset = head_idx * Q + q_seq_idx; - const int q_offset = group_offset; const int o_offset = group_offset; - queries += q_offset * D + simd_lid * qk_per_thread; + queries += head_idx * q_head_stride + q_seq_idx * q_seq_stride + + simd_lid * qk_per_thread; keys += kv_head_idx * k_head_stride + simd_gid * k_seq_stride + simd_lid * qk_per_thread; values += kv_head_idx * v_head_stride + simd_gid * v_seq_stride + @@ -156,8 +158,8 @@ template device float* maxs [[buffer(5)]], const constant uint& gqa_factor [[buffer(6)]], const constant uint& N [[buffer(7)]], - const constant uint2& k_head_seq_stride [[buffer(8)]], - const constant uint2& v_head_seq_stride [[buffer(9)]], + const constant uint3& qkv_head_strides [[buffer(8)]], + const constant uint3& qkv_seq_strides [[buffer(9)]], const constant float& scale [[buffer(10)]], const device bool* mask [[buffer(11)]], const constant uint3& mask_strides [[buffer(12)]], @@ -170,10 +172,12 @@ template constexpr int BD = 32; constexpr int qk_per_thread = D / BD; constexpr int v_per_thread = V / BD; - const int k_head_stride = k_head_seq_stride.x; - const int k_seq_stride = k_head_seq_stride.y; - const int v_head_stride = v_head_seq_stride.x; - const int v_seq_stride = v_head_seq_stride.y; + const int q_head_stride = qkv_head_strides.x; + const int q_seq_stride = 
qkv_seq_strides.x; + const int k_head_stride = qkv_head_strides.y; + const int k_seq_stride = qkv_seq_strides.y; + const int v_head_stride = qkv_head_strides.z; + const int v_seq_stride = qkv_seq_strides.z; const int mask_kv_seq_stride = mask_strides.x; const int mask_q_seq_stride = mask_strides.y; const int mask_head_stride = mask_strides.z; @@ -196,10 +200,10 @@ template const int head_idx = tid.x; const int q_seq_idx = tid.y; const int o_offset = head_idx * tpg.y + q_seq_idx; - const int q_offset = o_offset; const int kv_head_idx = head_idx / gqa_factor; - queries += q_offset * D + simd_lid * qk_per_thread; + queries += head_idx * q_head_stride + q_seq_idx * q_seq_stride + + simd_lid * qk_per_thread; keys += kv_head_idx * k_head_stride + (block_idx * BN + simd_gid) * k_seq_stride + simd_lid * qk_per_thread; values += kv_head_idx * v_head_stride + @@ -520,25 +524,25 @@ kernel void attention( } } -#define INSTANTIATE_SDPA_VECTOR(DTYPE, QK_DIM, VALUE_DIM) \ - template [[host_name("sdpa_vector_" #DTYPE "_" #QK_DIM \ - "_" #VALUE_DIM)]] kernel void \ - sdpa_vector( \ - const device DTYPE* queries [[buffer(0)]], \ - const device DTYPE* keys [[buffer(1)]], \ - const device DTYPE* values [[buffer(2)]], \ - device DTYPE* out [[buffer(3)]], \ - const constant uint& gqa_factor [[buffer(4)]], \ - const constant uint& N [[buffer(5)]], \ - const constant uint2& k_head_seq_stride [[buffer(6)]], \ - const constant uint2& v_head_seq_stride [[buffer(7)]], \ - const constant float& scale [[buffer(8)]], \ - const device bool* mask [[buffer(9)]], \ - const constant uint3& mask_strides [[buffer(10)]], \ - const constant bool& has_mask [[buffer(11)]], \ - uint3 tid [[threadgroup_position_in_grid]], \ - uint3 tpg [[threadgroups_per_grid]], \ - uint simd_gid [[simdgroup_index_in_threadgroup]], \ +#define INSTANTIATE_SDPA_VECTOR(DTYPE, QK_DIM, VALUE_DIM) \ + template [[host_name("sdpa_vector_" #DTYPE "_" #QK_DIM \ + "_" #VALUE_DIM)]] kernel void \ + sdpa_vector( \ + const device DTYPE* queries [[buffer(0)]], \ + const device DTYPE* keys [[buffer(1)]], \ + const device DTYPE* values [[buffer(2)]], \ + device DTYPE* out [[buffer(3)]], \ + const constant uint& gqa_factor [[buffer(4)]], \ + const constant uint& N [[buffer(5)]], \ + const constant uint3& qkv_head_strides [[buffer(6)]], \ + const constant uint3& qkv_seq_strides [[buffer(7)]], \ + const constant float& scale [[buffer(8)]], \ + const device bool* mask [[buffer(9)]], \ + const constant uint3& mask_strides [[buffer(10)]], \ + const constant bool& has_mask [[buffer(11)]], \ + uint3 tid [[threadgroup_position_in_grid]], \ + uint3 tpg [[threadgroups_per_grid]], \ + uint simd_gid [[simdgroup_index_in_threadgroup]], \ uint simd_lid [[thread_index_in_simdgroup]]); #define INSTANTIATE_SDPA_VECTOR_2PASS_1(DTYPE, QK_DIM, VALUE_DIM) \ @@ -553,8 +557,8 @@ kernel void attention( device float* maxs [[buffer(5)]], \ const constant uint& gqa_factor [[buffer(6)]], \ const constant uint& N [[buffer(7)]], \ - const constant uint2& k_head_seq_stride [[buffer(8)]], \ - const constant uint2& v_head_seq_stride [[buffer(9)]], \ + const constant uint3& qkv_head_strides [[buffer(8)]], \ + const constant uint3& qkv_seq_strides [[buffer(9)]], \ const constant float& scale [[buffer(10)]], \ const device bool* mask [[buffer(11)]], \ const constant uint3& mask_strides [[buffer(12)]], \ diff --git a/aten/src/ATen/native/mps/kernels/BinaryKernel.metal b/aten/src/ATen/native/mps/kernels/BinaryKernel.metal index f6f4935608e4..0539eab79500 100644 --- 
a/aten/src/ATen/native/mps/kernels/BinaryKernel.metal +++ b/aten/src/ATen/native/mps/kernels/BinaryKernel.metal @@ -39,6 +39,13 @@ struct lerp_alpha_functor { } }; +struct native_dropout_mask_and_scale_functor { + template + inline TA operator()(const TI a, const TI b, const TA scale) { + return static_cast(a) * static_cast(b) * scale; + } +}; + struct fmax_functor { template inline T operator()(const T a, const T b) { @@ -315,6 +322,20 @@ struct fmod_functor { } }; +struct igamma_functor { + template + inline T operator()(const T a, const T b) { + return c10::metal::igamma(a, b); + } +}; + +struct igammac_functor { + template + inline T operator()(const T a, const T b) { + return c10::metal::igammac(a, b); + } +}; + #define REGISTER_INTEGER_BINARY_OP(NAME) \ REGISTER_BINARY_OP(NAME, long, long); \ REGISTER_BINARY_OP(NAME, int, int); \ @@ -386,6 +407,8 @@ REGISTER_OPMATH_FLOAT_BINARY_OP(remainder); REGISTER_INTEGER_BINARY_OP(remainder); REGISTER_OPMATH_FLOAT_BINARY_OP(fmod); REGISTER_INTEGER_BINARY_OP(fmod); +REGISTER_OPMATH_FLOAT_BINARY_OP(igamma); +REGISTER_OPMATH_FLOAT_BINARY_OP(igammac); REGISTER_BINARY_ALPHA_OP(add_alpha, long, long, long); REGISTER_BINARY_ALPHA_OP(add_alpha, int, int, int); REGISTER_BINARY_ALPHA_OP(add_alpha, float, float, float); @@ -411,6 +434,10 @@ REGISTER_BINARY_ALPHA_OP(lerp_alpha, uchar, uchar, uchar); REGISTER_BINARY_ALPHA_OP(lerp_alpha, char, char, char); REGISTER_BINARY_ALPHA_OP(lerp_alpha, bool, bool, bool); +REGISTER_BINARY_ALPHA_OP(native_dropout_mask_and_scale, float, float, float); +REGISTER_BINARY_ALPHA_OP(native_dropout_mask_and_scale, bfloat, bfloat, bfloat); +REGISTER_BINARY_ALPHA_OP(native_dropout_mask_and_scale, half, half, half); + REGISTER_BINARY_ALPHA_OP(add_alpha, bfloat, bfloat, bfloat); REGISTER_BINARY_ALPHA_OP(sub_alpha, bfloat, bfloat, bfloat); REGISTER_BINARY_ALPHA_OP(lerp_alpha, bfloat, bfloat, bfloat); diff --git a/aten/src/ATen/native/mps/kernels/UnaryKernel.metal b/aten/src/ATen/native/mps/kernels/UnaryKernel.metal index 23c4810a2496..7db38da80532 100644 --- a/aten/src/ATen/native/mps/kernels/UnaryKernel.metal +++ b/aten/src/ATen/native/mps/kernels/UnaryKernel.metal @@ -490,11 +490,6 @@ struct bitwise_not_functor { } }; -template -float erfc(T x) { - return 1.0 - erf(x); -} - struct round_decimals_functor { template inline T operator()(const T x, const long ndigits) { @@ -503,6 +498,17 @@ struct round_decimals_functor { } }; +struct round_functor { + template , bool> = true> + inline T operator()(const T x) { + return static_cast(rint(float(x))); + } + template , bool> = true> + inline T operator()(const T x) { + return x; + } +}; + DEFINE_UNARY_FLOATING_FUNCTOR(erf); DEFINE_UNARY_FLOATING_FUNCTOR(erfc); DEFINE_UNARY_FLOATING_FUNCTOR(erfinv); @@ -515,6 +521,13 @@ REGISTER_UNARY_OP(neg, char, char); REGISTER_UNARY_OP(neg, uchar, uchar); REGISTER_UNARY_OP(neg, float, float); REGISTER_UNARY_OP(neg, half, half); +REGISTER_UNARY_OP(round, int, int); +REGISTER_UNARY_OP(round, long, long); +REGISTER_UNARY_OP(round, short, short); +REGISTER_UNARY_OP(round, char, char); +REGISTER_UNARY_OP(round, uchar, uchar); +REGISTER_UNARY_OP(round, float, float); +REGISTER_UNARY_OP(round, half, half); REGISTER_UNARY_OP(bitwise_not, int, int); REGISTER_UNARY_OP(bitwise_not, long, long); @@ -558,6 +571,7 @@ REGISTER_UNARY_OP(abs, half, half); INSTANTIATE_UNARY_KERNELS2(bfloat, bfloat); REGISTER_UNARY_OP(neg, bfloat, bfloat); +REGISTER_UNARY_OP(round, bfloat, bfloat); REGISTER_UNARY_OP(abs, bfloat, bfloat); INSTANTIATE_UNARY_KERNELS2(half, half); 
INSTANTIATE_UNARY_KERNELS2(float, float); diff --git a/aten/src/ATen/native/mps/operations/Attention.mm b/aten/src/ATen/native/mps/operations/Attention.mm index 69ec9af055ba..11498ade6fd0 100644 --- a/aten/src/ATen/native/mps/operations/Attention.mm +++ b/aten/src/ATen/native/mps/operations/Attention.mm @@ -182,6 +182,8 @@ uint maxSeqLength = k_.size(2); uint N = k_.size(2); uint B = q_.size(0) * q_.size(1); + uint q_head_stride = q_.stride(1); + uint q_seq_stride = q_.stride(2); uint k_head_stride = k_.stride(1); uint k_seq_stride = k_.stride(2); uint v_head_stride = v_.stride(1); @@ -209,8 +211,8 @@ out, 1, N, - std::array{k_head_stride, k_seq_stride}, - std::array{v_head_stride, v_seq_stride}, + std::array{q_head_stride, k_head_stride, v_head_stride}, + std::array{q_seq_stride, k_seq_stride, v_seq_stride}, scale_factor); if (has_mask) { @@ -257,6 +259,8 @@ uint B = batchSize * num_heads; uint gqa_factor = q_.size(1) / k_.size(1); + uint q_head_stride = q_.stride(1); + uint q_seq_stride = q_.stride(2); uint k_head_stride = k_.stride(1); uint k_seq_stride = k_.stride(2); uint v_head_stride = v_.stride(1); @@ -294,8 +298,8 @@ maxs, gqa_factor, N, - std::array{k_head_stride, k_seq_stride}, - std::array{v_head_stride, v_seq_stride}, + std::array{q_head_stride, k_head_stride, v_head_stride}, + std::array{q_seq_stride, k_seq_stride, v_seq_stride}, scale_factor); if (has_mask) { diff --git a/aten/src/ATen/native/mps/operations/BinaryKernel.mm b/aten/src/ATen/native/mps/operations/BinaryKernel.mm index b2a1b2757b13..0b303f48028f 100644 --- a/aten/src/ATen/native/mps/operations/BinaryKernel.mm +++ b/aten/src/ATen/native/mps/operations/BinaryKernel.mm @@ -168,6 +168,10 @@ static void lerp_scalar_mps_kernel(at::TensorIteratorBase& iter, const Scalar& w lib.exec_binary_kernel(iter, "lerp_alpha", weight); } +static void native_dropout_mask_and_scale_mps_kernel(at::TensorIteratorBase& iter, const Scalar& scale) { + lib.exec_binary_kernel(iter, "native_dropout_mask_and_scale", scale); +} + static void mul_mps_kernel(TensorIteratorBase& iter) { lib.exec_binary_kernel(iter, "mul"); } @@ -192,6 +196,14 @@ static void fmod_mps_kernel(TensorIteratorBase& iter) { lib.exec_binary_kernel(iter, "fmod"); } +static void igamma_mps_kernel(TensorIteratorBase& iter) { + lib.exec_binary_kernel(iter, "igamma"); +} + +static void igammac_mps_kernel(TensorIteratorBase& iter) { + lib.exec_binary_kernel(iter, "igammac"); +} + REGISTER_DISPATCH(fmax_stub, &fmax_mps_kernel) REGISTER_DISPATCH(fmin_stub, &fmin_mps_kernel) REGISTER_DISPATCH(copysign_stub, ©sign_mps_kernel) @@ -217,4 +229,6 @@ static void fmod_mps_kernel(TensorIteratorBase& iter) { REGISTER_DISPATCH(div_trunc_stub, &div_trunc_mps_kernel) REGISTER_DISPATCH(fmod_stub, &fmod_mps_kernel) REGISTER_DISPATCH(remainder_stub, &remainder_mps_kernel) +REGISTER_DISPATCH(igamma_stub, &igamma_mps_kernel) +REGISTER_DISPATCH(igammac_stub, &igammac_mps_kernel) } // namespace at::native diff --git a/aten/src/ATen/native/mps/operations/Dropout.mm b/aten/src/ATen/native/mps/operations/Dropout.mm new file mode 100644 index 000000000000..116367d809eb --- /dev/null +++ b/aten/src/ATen/native/mps/operations/Dropout.mm @@ -0,0 +1,45 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include +#include +#include +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#include +#endif + +namespace at::native { + +static Tensor native_dropout_mask_and_scale(const Tensor& input, const Tensor& mask, float scale) { + auto output 
= at::empty_like(input); + mps::binary_op_kernel("native_dropout_mask_and_scale", input, mask, output, scale); + return output; +} + +std::tuple native_dropout_mps(const Tensor& input, double p, std::optional train) { + if (input.numel() == 0 || !train.value_or(false) || p == 0) { + return {input.clone(), at::ones_like(input, input.options().dtype(c10::kBool))}; + } + + float p_comp = 1.0f - p; + Tensor mask = at::empty_like(input, input.options().dtype(c10::kBool)); + mask.bernoulli_(p_comp); + auto scale = p_comp == 0 ? 0.0f : 1.0f / p_comp; + Tensor output = native_dropout_mask_and_scale(input, mask, scale); + return {std::move(output), std::move(mask)}; +} + +Tensor native_dropout_backward_mps(const Tensor& grad, const Tensor& mask, double scale) { + auto grad_float = isFloatingType(grad.scalar_type()) ? grad : grad.to(c10::kFloat); + return native_dropout_mask_and_scale(grad_float, mask, scale); +} + +} // namespace at::native \ No newline at end of file diff --git a/aten/src/ATen/native/mps/operations/Linear.mm b/aten/src/ATen/native/mps/operations/Linear.mm index 42769c13f1e1..219086edd8e3 100644 --- a/aten/src/ATen/native/mps/operations/Linear.mm +++ b/aten/src/ATen/native/mps/operations/Linear.mm @@ -115,7 +115,10 @@ Tensor _mps_linear(const Tensor& input, const Tensor& weight_arg, const std::opt return output; } - if (is_macos_13_or_newer(MacOSVersion::MACOS_VER_15_0_PLUS)) { + // No-graph execution causes nonsense if these are non-contiguous. + const bool is_contiguous = input.is_contiguous() && weight.is_contiguous() && bias.is_contiguous(); + + if (is_macos_13_or_newer(MacOSVersion::MACOS_VER_15_0_PLUS) && is_contiguous) { _mps_linear_nograph(input, weight, bias, output); // Squeeze last dim of 1D linear return weight_arg.dim() != 1 ? output : output.squeeze(-1); diff --git a/aten/src/ATen/native/mps/operations/Sort.mm b/aten/src/ATen/native/mps/operations/Sort.mm index cfec1e443e25..6ff47044df13 100644 --- a/aten/src/ATen/native/mps/operations/Sort.mm +++ b/aten/src/ATen/native/mps/operations/Sort.mm @@ -2,6 +2,7 @@ #define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include #include +#include #include #include #include @@ -11,10 +12,85 @@ #include #include #else +#include #include #include #endif namespace at::native { +namespace { + +void kthvalue_out_mps_impl(const Tensor& self, int64_t k, int64_t dim, Tensor& values, Tensor& indices) { + using namespace mps; + if (self.dim() == 0 && self.numel() == 1) { + values.copy_(self); + indices.zero_(); + return; + } + // Handle empty tensors + if (self.numel() == 0) { + values.copy_(self); + indices.copy_(values.toType(at::ScalarType::Long)); + return; + } + // issue #154890, raising error to prevent crash within MPSGraph until + // workaround is implemented. + TORCH_CHECK(self.dim() - dim <= 4, "On-going issue on MPSGraph topk when ndims() - axis > 4, see issue #154890"); + + auto stream = getCurrentMPSStream(); + struct CachedGraph : public MPSCachedGraph { + CachedGraph(MPSGraph* graph) : MPSCachedGraph(graph) {} + MPSGraphTensor *selfTensor = nil, *valuesTensor = nil, *indicesTensor = nil; + }; + + // MPSGraph kthvalue is always sorted. 
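+  // Values are computed by sorting the (cast) input along `dim` and slicing +  // out the element at offset k - 1; the matching indices come from an argsort +  // of the same tensor sliced at the same offset. The compiled graph is cached +  // under a key built from the input shape, dtype, k, and dim.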
+ @autoreleasepool { + // Input as placeholders + MPSShape* input_shape = getMPSShape(self); + NSString* ns_shape_key = [[input_shape valueForKey:@"description"] componentsJoinedByString:@","]; + std::string key = std::string("kthvalue:") + [ns_shape_key UTF8String] + ":" + getMPSTypeString(self) + ":k" + + std::to_string(k) + ":dim" + std::to_string(dim); + auto cachedGraph = LookUpOrCreateCachedGraph(key, [&](auto mpsGraph, auto newCachedGraph) { + newCachedGraph->selfTensor = mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(self), input_shape); + + MPSGraphTensor* castInputTensor = newCachedGraph->selfTensor; + MPSDataType dataType = getMPSDataType(self); + // #issue 104398441 sortWithTensor and argsortWithTensor + if (dataType != MPSDataTypeInt32 && dataType != MPSDataTypeFloat32 && dataType != MPSDataTypeFloat16) { + dataType = (dataType & MPSDataTypeFloatBit) ? MPSDataTypeFloat32 : MPSDataTypeInt32; + castInputTensor = [mpsGraph castTensor:newCachedGraph->selfTensor toType:dataType name:@"castInputTensor"]; + } + MPSGraphTensor* sortedTensor = [mpsGraph sortWithTensor:castInputTensor + axis:(NSUInteger)dim + descending:false + name:nil]; + sortedTensor = [mpsGraph sliceTensor:sortedTensor + dimension:(NSUInteger)dim + start:((NSUInteger)k - 1) + length:1 + name:nil]; + MPSGraphTensor* argSortedTensor = [mpsGraph argSortWithTensor:castInputTensor + axis:(NSInteger)dim + descending:false + name:@"kthvalue_out"]; + argSortedTensor = [mpsGraph sliceTensor:argSortedTensor + dimension:dim + start:((NSUInteger)k - 1) + length:1 + name:nil]; + newCachedGraph->valuesTensor = sortedTensor; + newCachedGraph->indicesTensor = argSortedTensor; + }); + Placeholder inputPlaceholder = Placeholder(cachedGraph->selfTensor, self); + // Outputs as placeholders + Placeholder valuesPlaceholder = Placeholder(cachedGraph->valuesTensor, values); + Placeholder indicesPlaceholder = Placeholder(cachedGraph->indicesTensor, indices); + // Create dictionary of inputs and outputs + auto feeds = dictionaryFromPlaceholders(inputPlaceholder); + auto results = dictionaryFromPlaceholders(valuesPlaceholder, indicesPlaceholder); + runMPSGraph(stream, cachedGraph->graph(), feeds, results); + } +} +} // anonymous namespace // sort TORCH_IMPL_FUNC(sort_stable_out_mps) @@ -81,4 +157,31 @@ runMPSGraph(stream, cachedGraph->graph(), feeds, results); } } + +std::tuple kthvalue_out_mps(const Tensor& self, + int64_t k, + int64_t dim_, + bool keepdim, + Tensor& values, + Tensor& indices) { + // See note [Writing Nondeterministic Operations] + // If there are duplicate elements of the kth value, the procedure for choosing which + // of the duplicates to use for the indices output is nondeterministic. + at::globalContext().alertNotDeterministic("kthvalue MPS"); + + int64_t dim = maybe_wrap_dim(dim_, self.dim(), /*wrap_scalar=*/true); + int64_t slicesize = self.dim() == 0 ? 
1 : self.size(dim); + TORCH_CHECK(k >= 1 && k <= slicesize, "kthvalue(): selected number k out of range for dimension ", dim); + at::assert_no_overlap(self, values); + _reduction_with_indices_allocate_or_resize_output(values, indices, self, dim, keepdim); + + kthvalue_out_mps_impl(self, k, dim, values, indices); + + if (!keepdim) { + values.squeeze_(dim); + indices.squeeze_(dim); + } + + return std::forward_as_tuple(values, indices); +} } // namespace at::native diff --git a/aten/src/ATen/native/mps/operations/TensorCompare.mm b/aten/src/ATen/native/mps/operations/TensorCompare.mm index 16e0608012f3..7b637d896f85 100644 --- a/aten/src/ATen/native/mps/operations/TensorCompare.mm +++ b/aten/src/ATen/native/mps/operations/TensorCompare.mm @@ -335,6 +335,9 @@ static void isin_Tensor_Tensor_out_mps(const Tensor& elements, } static void is_posneginf_helper(TensorIteratorBase& iter, bool is_neg) { + if (iter.numel() == 0) { + return; + } const auto& self = iter.input(0); auto& out = iter.output(0); @autoreleasepool { diff --git a/aten/src/ATen/native/mps/operations/UnaryKernel.mm b/aten/src/ATen/native/mps/operations/UnaryKernel.mm index b560739ed40c..7e150b133cc6 100644 --- a/aten/src/ATen/native/mps/operations/UnaryKernel.mm +++ b/aten/src/ATen/native/mps/operations/UnaryKernel.mm @@ -50,6 +50,7 @@ static void round_decimals_kernel(TensorIteratorBase& iter, int64_t decimals) { REGISTER_UNARY_TI_DISPATCH(log); REGISTER_UNARY_TI_DISPATCH(log1p); REGISTER_UNARY_TI_DISPATCH(bitwise_not); +REGISTER_UNARY_TI_DISPATCH(round); REGISTER_UNARY_TI_DISPATCH(sigmoid); REGISTER_DISPATCH(round_decimals_stub, round_decimals_kernel); } // namespace at::native diff --git a/aten/src/ATen/native/mps/operations/UnaryOps.mm b/aten/src/ATen/native/mps/operations/UnaryOps.mm index 8fbefcb6ab8a..d7ce40e5cbb4 100644 --- a/aten/src/ATen/native/mps/operations/UnaryOps.mm +++ b/aten/src/ATen/native/mps/operations/UnaryOps.mm @@ -184,7 +184,6 @@ static void unary_op(const Tensor& self, REGISTER_MPS_UNARY_STUB(ceil, ceil); REGISTER_MPS_UNARY_STUB(floor, floor); -REGISTER_MPS_UNARY_STUB(round, round); REGISTER_MPS_UNARY_STUB(trunc, truncate); #define CREATE_MPS_STRUCTURED_UNARY_TORCH_IMPL_FUNC(func_out, func_stub) \ @@ -418,6 +417,7 @@ static void cumulative_op_impl(const Tensor& self, Tensor& conj_physical_out_mps(const Tensor& self, Tensor& result) { TORCH_CHECK(self.is_complex()); + TORCH_CHECK(self.dtype() != at::kComplexDouble); mps::unary_op(self, result, "conj", ^MPSGraphTensor*(MPSGraph* mpsGraph, MPSGraphTensor* inputTensor) { return [mpsGraph conjugateWithTensor:inputTensor name:nil]; }); diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index 113db1c1e437..abb061afc5c9 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -288,6 +288,7 @@ dispatch: CPU: native_dropout_cpu CUDA: native_dropout_cuda + MPS: native_dropout_mps NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: native_dropout_nested tags: [nondeterministic_seeded, core] autogen: native_dropout.out @@ -296,6 +297,7 @@ dispatch: CPU, NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: native_dropout_backward CUDA: native_dropout_backward_cuda + MPS: native_dropout_backward_mps autogen: native_dropout_backward.out tags: pointwise @@ -340,8 +342,8 @@ variants: function, method dispatch: CompositeExplicitAutograd: abs - SparseCPU, SparseCUDA: abs_sparse - SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: abs_sparse_csr + SparseCPU, SparseCUDA, SparseMPS: 
abs_sparse + SparseCsrCPU, SparseCsrCUDA, SparseCsrMPS, SparseCsrMeta: abs_sparse_csr NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_abs tags: [core, pointwise] @@ -350,16 +352,16 @@ variants: function, method dispatch: CompositeExplicitAutograd: abs_ - SparseCPU, SparseCUDA: abs_sparse_ - SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: abs_sparse_csr_ + SparseCPU, SparseCUDA, SparseMPS: abs_sparse_ + SparseCsrCPU, SparseCsrCUDA, SparseCsrMPS, SparseCsrMeta: abs_sparse_csr_ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_abs_ - func: abs.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) device_check: NoCheck # TensorIterator dispatch: CPU, CUDA, MPS, MTIA: abs_out - SparseCPU, SparseCUDA: abs_sparse_out - SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: abs_sparse_csr_out + SparseCPU, SparseCUDA, SparseMPS: abs_sparse_out + SparseCsrCPU, SparseCsrCUDA, SparseCsrMPS, SparseCsrMeta: abs_sparse_csr_out tags: pointwise # Note [Adding an alias] @@ -428,7 +430,7 @@ variants: function, method structured_delegate: sgn.out dispatch: - SparseCPU, SparseCUDA: sgn_sparse + SparseCPU, SparseCUDA, SparseMPS: sgn_sparse SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sgn_sparse_csr NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_sgn tags: pointwise @@ -437,7 +439,7 @@ variants: method structured_delegate: sgn.out dispatch: - SparseCPU, SparseCUDA: sgn_sparse_ + SparseCPU, SparseCUDA, SparseMPS: sgn_sparse_ SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sgn_sparse_csr_ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_sgn_ tags: pointwise @@ -448,7 +450,7 @@ dispatch: CPU, CUDA: sgn_out MPS: sgn_out_mps - SparseCPU, SparseCUDA: sgn_sparse_out + SparseCPU, SparseCUDA, SparseMPS: sgn_sparse_out SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sgn_sparse_csr_out tags: pointwise @@ -476,7 +478,7 @@ variants: function, method dispatch: CompositeExplicitAutograd: _conj_physical - SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: conj_physical_sparse_csr + SparseCsrCPU, SparseCsrCUDA, SparseCsrMPS, SparseCsrMeta: conj_physical_sparse_csr autogen: _conj_physical.out - func: conj_physical(Tensor self) -> Tensor @@ -487,8 +489,8 @@ dispatch: CPU, CUDA: conj_physical_out MPS: conj_physical_out_mps - SparseCPU, SparseCUDA: conj_physical_out_sparse - SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: conj_physical_sparse_csr_out + SparseCPU, SparseCUDA, SparseMPS: conj_physical_out_sparse + SparseCsrCPU, SparseCsrCUDA, SparseCsrMPS, SparseCsrMeta: conj_physical_sparse_csr_out tags: pointwise - func: conj_physical_(Tensor(a!) self) -> Tensor(a!) 
@@ -554,7 +556,7 @@ structured_delegate: add.out variants: function, method dispatch: - SparseCPU, SparseCUDA, SparseMeta: add_sparse + SparseCPU, SparseCUDA, SparseMPS, SparseMeta: add_sparse SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: add_sparse_csr MkldnnCPU: mkldnn_add ZeroTensor: add_zerotensor @@ -566,7 +568,7 @@ variants: method structured_delegate: add.out dispatch: - SparseCPU, SparseCUDA, SparseMeta: add_sparse_ + SparseCPU, SparseCUDA, SparseMPS, SparseMeta: add_sparse_ SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: add_sparse_csr_ MkldnnCPU: mkldnn_add_ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_add__Tensor @@ -582,6 +584,7 @@ dispatch: SparseCPU, SparseMeta: add_out_sparse_cpu SparseCUDA: add_out_sparse_cuda + SparseMPS: add_out_sparse_mps SparseCsrCPU, SparseCsrMeta: add_out_sparse_compressed_cpu SparseCsrCUDA: add_out_sparse_compressed_cuda MkldnnCPU: mkldnn_add_out @@ -874,7 +877,7 @@ variants: function, method structured_delegate: asinh.out dispatch: - SparseCPU, SparseCUDA: asinh_sparse + SparseCPU, SparseCUDA, SparseMPS: asinh_sparse SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: asinh_sparse_csr tags: [core, pointwise] @@ -882,7 +885,7 @@ variants: function, method structured_delegate: asinh.out dispatch: - SparseCPU, SparseCUDA: asinh_sparse_ + SparseCPU, SparseCUDA, SparseMPS: asinh_sparse_ SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: asinh_sparse_csr_ tags: pointwise @@ -892,7 +895,7 @@ dispatch: CPU, CUDA: asinh_out MPS: asinh_out_mps - SparseCPU, SparseCUDA: asinh_sparse_out + SparseCPU, SparseCUDA, SparseMPS: asinh_sparse_out SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: asinh_sparse_csr_out tags: pointwise @@ -909,7 +912,7 @@ structured_delegate: atanh.out variants: function, method dispatch: - SparseCPU, SparseCUDA: atanh_sparse + SparseCPU, SparseCUDA, SparseMPS: atanh_sparse SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: atanh_sparse_csr tags: [core, pointwise] @@ -917,7 +920,7 @@ structured_delegate: atanh.out variants: function, method dispatch: - SparseCPU, SparseCUDA: atanh_sparse_ + SparseCPU, SparseCUDA, SparseMPS: atanh_sparse_ SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: atanh_sparse_csr_ tags: pointwise @@ -927,7 +930,7 @@ dispatch: CPU, CUDA: atanh_out MPS: atanh_out_mps - SparseCPU, SparseCUDA: atanh_sparse_out + SparseCPU, SparseCUDA, SparseMPS: atanh_sparse_out SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: atanh_sparse_csr_out tags: pointwise # arctanh, alias for atanh @@ -964,7 +967,7 @@ variants: function, method structured_delegate: asin.out dispatch: - SparseCPU, SparseCUDA: asin_sparse + SparseCPU, SparseCUDA, SparseMPS: asin_sparse SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: asin_sparse_csr tags: [core, pointwise] @@ -973,7 +976,7 @@ variants: function, method structured_delegate: asin.out dispatch: - SparseCPU, SparseCUDA: asin_sparse_ + SparseCPU, SparseCUDA, SparseMPS: asin_sparse_ SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: asin_sparse_csr_ tags: pointwise @@ -983,7 +986,7 @@ structured_inherits: TensorIteratorBase dispatch: CPU, CUDA, MPS: asin_out - SparseCPU, SparseCUDA: asin_sparse_out + SparseCPU, SparseCUDA, SparseMPS: asin_sparse_out SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: asin_sparse_csr_out tags: pointwise @@ -1001,7 +1004,7 @@ structured_delegate: atan.out variants: function, method dispatch: - SparseCPU, SparseCUDA: atan_sparse + SparseCPU, SparseCUDA, SparseMPS: atan_sparse SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: atan_sparse_csr tags: [core, pointwise] @@ -1010,7 +1013,7 @@ structured_delegate: atan.out 
variants: function, method dispatch: - SparseCPU, SparseCUDA: atan_sparse_ + SparseCPU, SparseCUDA, SparseMPS: atan_sparse_ SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: atan_sparse_csr_ tags: pointwise @@ -1020,7 +1023,7 @@ structured_inherits: TensorIteratorBase dispatch: CPU, CUDA, MPS: atan_out - SparseCPU, SparseCUDA: atan_sparse_out + SparseCPU, SparseCUDA, SparseMPS: atan_sparse_out SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: atan_sparse_csr_out tags: pointwise @@ -1459,7 +1462,7 @@ structured_delegate: ceil.out variants: function, method dispatch: - SparseCPU, SparseCUDA: ceil_sparse + SparseCPU, SparseCUDA, SparseMPS: ceil_sparse SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: ceil_sparse_csr tags: [core, pointwise] @@ -1468,7 +1471,7 @@ structured_delegate: ceil.out variants: function, method dispatch: - SparseCPU, SparseCUDA: ceil_sparse_ + SparseCPU, SparseCUDA, SparseMPS: ceil_sparse_ SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: ceil_sparse_csr_ tags: pointwise @@ -1478,7 +1481,7 @@ structured_inherits: TensorIteratorBase dispatch: CPU, CUDA, MPS: ceil_out - SparseCPU, SparseCUDA: ceil_sparse_out + SparseCPU, SparseCUDA, SparseMPS: ceil_sparse_out SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: ceil_sparse_csr_out tags: pointwise @@ -2406,7 +2409,7 @@ MPS: empty_mps Meta: empty_meta_symint MkldnnCPU: empty_mkldnn - SparseCPU, SparseCUDA: empty_sparse + SparseCPU, SparseCUDA, SparseMPS: empty_sparse SparseMeta: empty_sparse_symint SparseCsrCPU, SparseCsrCUDA: empty_sparse_compressed SparseCsrMeta: empty_sparse_compressed_symint @@ -2534,7 +2537,7 @@ structured_delegate: erf.out variants: function, method dispatch: - SparseCPU, SparseCUDA: erf_sparse + SparseCPU, SparseCUDA, SparseMPS: erf_sparse SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: erf_sparse_csr tags: [core, pointwise] @@ -2543,7 +2546,7 @@ structured_delegate: erf.out variants: function, method dispatch: - SparseCPU, SparseCUDA: erf_sparse_ + SparseCPU, SparseCUDA, SparseMPS: erf_sparse_ SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: erf_sparse_csr_ tags: pointwise @@ -2553,7 +2556,7 @@ structured_inherits: TensorIteratorBase dispatch: CPU, CUDA, MPS, MTIA: erf_out - SparseCPU, SparseCUDA: erf_sparse_out + SparseCPU, SparseCUDA, SparseMPS: erf_sparse_out SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: erf_sparse_csr_out tags: pointwise @@ -2619,7 +2622,7 @@ structured_delegate: expm1.out variants: function, method dispatch: - SparseCPU, SparseCUDA: expm1_sparse + SparseCPU, SparseCUDA, SparseMPS: expm1_sparse SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: expm1_sparse_csr tags: [core, pointwise] @@ -2628,7 +2631,7 @@ structured_delegate: expm1.out variants: function, method dispatch: - SparseCPU, SparseCUDA: expm1_sparse_ + SparseCPU, SparseCUDA, SparseMPS: expm1_sparse_ SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: expm1_sparse_csr_ tags: pointwise @@ -2638,7 +2641,7 @@ structured_inherits: TensorIteratorBase dispatch: CPU, CUDA, MPS: expm1_out - SparseCPU, SparseCUDA: expm1_sparse_out + SparseCPU, SparseCUDA, SparseMPS: expm1_sparse_out SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: expm1_sparse_csr_out tags: pointwise @@ -2737,7 +2740,7 @@ structured_delegate: floor.out variants: function, method dispatch: - SparseCPU, SparseCUDA: floor_sparse + SparseCPU, SparseCUDA, SparseMPS: floor_sparse SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: floor_sparse_csr tags: [core, pointwise] @@ -2746,7 +2749,7 @@ structured_delegate: floor.out variants: function, method dispatch: - SparseCPU, SparseCUDA: floor_sparse_ + SparseCPU, SparseCUDA, SparseMPS: 
floor_sparse_ SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: floor_sparse_csr_ tags: pointwise @@ -2756,7 +2759,7 @@ structured_inherits: TensorIteratorBase dispatch: CPU, CUDA, MPS: floor_out - SparseCPU, SparseCUDA: floor_sparse_out + SparseCPU, SparseCUDA, SparseMPS: floor_sparse_out SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: floor_sparse_csr_out tags: pointwise @@ -2764,7 +2767,7 @@ device_check: NoCheck # TensorIterator variants: function, method dispatch: - CPU, CUDA, MPS: floor_divide + CPU, CUDA, MPS, MTIA: floor_divide SparseCPU, SparseCUDA: floor_divide_sparse - func: floor_divide_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) @@ -2798,7 +2801,7 @@ structured_delegate: frac.out variants: function, method dispatch: - SparseCPU, SparseCUDA: frac_sparse + SparseCPU, SparseCUDA, SparseMPS: frac_sparse SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: frac_sparse_csr tags: pointwise @@ -2807,7 +2810,7 @@ structured_delegate: frac.out variants: function, method dispatch: - SparseCPU, SparseCUDA: frac_sparse_ + SparseCPU, SparseCUDA, SparseMPS: frac_sparse_ SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: frac_sparse_csr_ tags: pointwise @@ -2818,7 +2821,7 @@ dispatch: CPU, CUDA: frac_out MPS: frac_out_mps - SparseCPU, SparseCUDA: frac_sparse_out + SparseCPU, SparseCUDA, SparseMPS: frac_sparse_out SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: frac_sparse_csr_out tags: pointwise @@ -3208,7 +3211,7 @@ dispatch: CPU, CUDA, MPS, MTIA: isnan NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_isnan - SparseCPU, SparseCUDA: isnan_sparse + SparseCPU, SparseCUDA, SparseMPS: isnan_sparse SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: isnan_sparse_csr autogen: isnan.out tags: [core, pointwise] @@ -3289,6 +3292,7 @@ dispatch: CPU: kthvalue_out_cpu CUDA: kthvalue_out_cuda + MPS: kthvalue_out_mps - func: kthvalue.dimname(Tensor self, SymInt k, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices) variants: function, method @@ -3336,21 +3340,21 @@ variants: function, method dispatch: CompositeExplicitAutograd: nan_to_num - SparseCPU, SparseCUDA: nan_to_num_sparse + SparseCPU, SparseCUDA, SparseMPS: nan_to_num_sparse tags: pointwise - func: nan_to_num_(Tensor(a!) self, float? nan=None, float? posinf=None, float? neginf=None) -> Tensor(a!) variants: function, method dispatch: CompositeExplicitAutograd: nan_to_num_ - SparseCPU, SparseCUDA: nan_to_num_sparse_ + SparseCPU, SparseCUDA, SparseMPS: nan_to_num_sparse_ tags: pointwise - func: nan_to_num.out(Tensor self, float? nan=None, float? posinf=None, float? neginf=None, *, Tensor(a!) out) -> Tensor(a!) dispatch: CPU, CUDA, MTIA: nan_to_num_out MPS: nan_to_num_out_mps - SparseCPU, SparseCUDA: nan_to_num_sparse_out + SparseCPU, SparseCUDA, SparseMPS: nan_to_num_sparse_out tags: pointwise - func: linear(Tensor input, Tensor weight, Tensor? 
bias=None) -> Tensor @@ -3553,7 +3557,7 @@ structured_delegate: log1p.out variants: function, method dispatch: - SparseCPU, SparseCUDA: log1p_sparse + SparseCPU, SparseCUDA, SparseMPS: log1p_sparse SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: log1p_sparse_csr tags: [core, pointwise] @@ -3562,7 +3566,7 @@ structured_delegate: log1p.out variants: function, method dispatch: - SparseCPU, SparseCUDA: log1p_sparse_ + SparseCPU, SparseCUDA, SparseMPS: log1p_sparse_ SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: log1p_sparse_csr_ tags: pointwise @@ -3572,7 +3576,7 @@ structured_inherits: TensorIteratorBase dispatch: CPU, CUDA, MPS: log1p_out - SparseCPU, SparseCUDA: log1p_sparse_out + SparseCPU, SparseCUDA, SparseMPS: log1p_sparse_out SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: log1p_sparse_csr_out tags: pointwise @@ -4664,7 +4668,7 @@ variants: function, method dispatch: CompositeExplicitAutograd: rad2deg - SparseCPU, SparseCUDA: rad2deg_sparse + SparseCPU, SparseCUDA, SparseMPS: rad2deg_sparse SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: rad2deg_sparse_csr tags: pointwise @@ -4672,14 +4676,14 @@ variants: function, method dispatch: CompositeExplicitAutograd: rad2deg_ - SparseCPU, SparseCUDA: rad2deg_sparse_ + SparseCPU, SparseCUDA, SparseMPS: rad2deg_sparse_ SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: rad2deg_sparse_csr_ tags: pointwise - func: rad2deg.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) dispatch: CompositeExplicitAutograd: rad2deg_out - SparseCPU, SparseCUDA: rad2deg_sparse_out + SparseCPU, SparseCUDA, SparseMPS: rad2deg_sparse_out SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: rad2deg_sparse_csr_out tags: pointwise @@ -4687,7 +4691,7 @@ variants: function, method dispatch: CompositeExplicitAutograd: deg2rad - SparseCPU, SparseCUDA: deg2rad_sparse + SparseCPU, SparseCUDA, SparseMPS: deg2rad_sparse SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: deg2rad_sparse_csr tags: pointwise @@ -4695,14 +4699,14 @@ variants: function, method dispatch: CompositeExplicitAutograd: deg2rad_ - SparseCPU, SparseCUDA: deg2rad_sparse_ + SparseCPU, SparseCUDA, SparseMPS: deg2rad_sparse_ SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: deg2rad_sparse_csr_ tags: pointwise - func: deg2rad.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) 
dispatch: CompositeExplicitAutograd: deg2rad_out - SparseCPU, SparseCUDA: deg2rad_sparse_out + SparseCPU, SparseCUDA, SparseMPS: deg2rad_sparse_out SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: deg2rad_sparse_csr_out tags: pointwise @@ -4928,7 +4932,7 @@ structured_delegate: neg.out variants: function, method dispatch: - SparseCPU, SparseCUDA: neg_sparse + SparseCPU, SparseCUDA, SparseMPS: neg_sparse SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: neg_sparse_csr NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_neg tags: [core, pointwise] @@ -4938,7 +4942,7 @@ structured_delegate: neg.out variants: function, method dispatch: - SparseCPU, SparseCUDA: neg_sparse_ + SparseCPU, SparseCUDA, SparseMPS: neg_sparse_ SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: neg_sparse_csr_ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_neg_ tags: pointwise @@ -4949,7 +4953,7 @@ structured_inherits: TensorIteratorBase dispatch: CPU, CUDA, MPS, MTIA: neg_out - SparseCPU, SparseCUDA: neg_out_sparse + SparseCPU, SparseCUDA, SparseMPS: neg_out_sparse SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: neg_sparse_csr_out tags: pointwise # Alias for neg @@ -5033,7 +5037,7 @@ structured_delegate: round.out variants: function, method dispatch: - SparseCPU, SparseCUDA: round_sparse + SparseCPU, SparseCUDA, SparseMPS: round_sparse SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: round_sparse_csr tags: [core, pointwise] @@ -5042,7 +5046,7 @@ structured_delegate: round.out variants: function, method dispatch: - SparseCPU, SparseCUDA: round_sparse_ + SparseCPU, SparseCUDA, SparseMPS: round_sparse_ SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: round_sparse_csr_ tags: pointwise @@ -5052,7 +5056,7 @@ structured_inherits: TensorIteratorBase dispatch: CPU, CUDA, MPS: round_out - SparseCPU, SparseCUDA: round_sparse_out + SparseCPU, SparseCUDA, SparseMPS: round_sparse_out SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: round_sparse_csr_out tags: pointwise @@ -5095,7 +5099,7 @@ QuantizedCPU: relu_quantized_cpu QuantizedCUDA: relu_quantized_cuda NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_relu - SparseCPU, SparseCUDA: relu_sparse + SparseCPU, SparseCUDA, SparseMPS: relu_sparse SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: relu_sparse_csr tags: [core, pointwise] @@ -5110,7 +5114,7 @@ QuantizedCPU: relu_quantized_cpu_ QuantizedCUDA: relu_quantized_cuda_ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_relu_ - SparseCPU, SparseCUDA: relu_sparse_ + SparseCPU, SparseCUDA, SparseMPS: relu_sparse_ SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: relu_sparse_csr_ autogen: relu.out tags: pointwise @@ -5397,7 +5401,7 @@ variants: function, method dispatch: SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sin_sparse_csr - SparseCPU, SparseCUDA: sin_sparse + SparseCPU, SparseCUDA, SparseMPS: sin_sparse NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_sin tags: [core, pointwise] @@ -5407,7 +5411,7 @@ variants: function, method dispatch: SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sin_sparse_csr_ - SparseCPU, SparseCUDA: sin_sparse_ + SparseCPU, SparseCUDA, SparseMPS: sin_sparse_ tags: pointwise - func: sin.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) 
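The SparseMPS (and SparseCsrMPS) keys being added to the dispatch lists in these hunks register the existing backend-generic sparse kernels, plus new MPS-specific ones such as add_out_sparse_mps, for sparse tensors that live on the MPS device. A minimal usage sketch of the intended effect, assuming a build where sparse COO tensors can be constructed on the "mps" device (shapes and values below are illustrative only):

import torch

# Hypothetical example: a 2x3 sparse COO tensor on the MPS device.
i = torch.tensor([[0, 1], [2, 0]], device="mps")  # indices, shape (sparse_dim, nnz)
v = torch.tensor([1.5, -2.0], device="mps")       # values, shape (nnz,)
s = torch.sparse_coo_tensor(i, v, (2, 3))

# With the SparseMPS dispatch entries, these calls reach the sparse kernels
# named above (e.g. sin_sparse, neg_sparse, add_out_sparse_mps).
print(s.sin())
print(s.neg())
print((s + s).coalesce())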
@@ -5417,7 +5421,7 @@ dispatch: CPU, CUDA, MPS, MTIA: sin_out SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sin_sparse_csr_out - SparseCPU, SparseCUDA: sin_sparse_out + SparseCPU, SparseCUDA, SparseMPS: sin_sparse_out tags: pointwise - func: sinc(Tensor self) -> Tensor @@ -5442,7 +5446,7 @@ structured_delegate: sinh.out variants: function, method dispatch: - SparseCPU, SparseCUDA: sinh_sparse + SparseCPU, SparseCUDA, SparseMPS: sinh_sparse SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sinh_sparse_csr tags: [core, pointwise] @@ -5451,7 +5455,7 @@ structured_delegate: sinh.out variants: function, method dispatch: - SparseCPU, SparseCUDA: sinh_sparse_ + SparseCPU, SparseCUDA, SparseMPS: sinh_sparse_ SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sinh_sparse_csr_ tags: pointwise @@ -5461,7 +5465,7 @@ structured_inherits: TensorIteratorBase dispatch: CPU, CUDA, MPS: sinh_out - SparseCPU, SparseCUDA: sinh_sparse_out + SparseCPU, SparseCUDA, SparseMPS: sinh_sparse_out SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sinh_sparse_csr_out # Returns a copy of this `Variable` that is detached from its autograd graph. @@ -5509,6 +5513,13 @@ tags: core manual_cpp_binding: True +- func: sym_is_contiguous(Tensor self, MemoryFormat memory_format=contiguous_format) -> SymBool + variants: function + device_check: NoCheck + device_guard: False + tags: core + manual_cpp_binding: True + - func: sym_numel(Tensor self) -> SymInt variants: function device_check: NoCheck @@ -5904,7 +5915,7 @@ variants: function, method dispatch: NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_sqrt - SparseCPU, SparseCUDA: sqrt_sparse + SparseCPU, SparseCUDA, SparseMPS: sqrt_sparse SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sqrt_sparse_csr tags: [core, pointwise] @@ -5913,7 +5924,7 @@ structured_delegate: sqrt.out variants: function, method dispatch: - SparseCPU, SparseCUDA: sqrt_sparse_ + SparseCPU, SparseCUDA, SparseMPS: sqrt_sparse_ SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sqrt_sparse_csr_ tags: pointwise @@ -5923,7 +5934,7 @@ structured_inherits: TensorIteratorBase dispatch: CPU, CUDA, MPS, MTIA: sqrt_out - SparseCPU, SparseCUDA: sqrt_sparse_out + SparseCPU, SparseCUDA, SparseMPS: sqrt_sparse_out SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sqrt_sparse_csr_out tags: pointwise @@ -6061,7 +6072,7 @@ structured_delegate: tan.out variants: function, method dispatch: - SparseCPU, SparseCUDA: tan_sparse + SparseCPU, SparseCUDA, SparseMPS: tan_sparse SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: tan_sparse_csr tags: [core, pointwise] @@ -6070,7 +6081,7 @@ structured_delegate: tan.out variants: function, method dispatch: - SparseCPU, SparseCUDA: tan_sparse_ + SparseCPU, SparseCUDA, SparseMPS: tan_sparse_ SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: tan_sparse_csr_ tags: pointwise @@ -6080,7 +6091,7 @@ structured_inherits: TensorIteratorBase dispatch: CPU, CUDA, MPS: tan_out - SparseCPU, SparseCUDA: tan_sparse_out + SparseCPU, SparseCUDA, SparseMPS: tan_sparse_out SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: tan_sparse_csr_out tags: pointwise @@ -6091,7 +6102,7 @@ dispatch: QuantizedCPU: tanh_quantized_cpu MkldnnCPU: mkldnn_tanh - SparseCPU, SparseCUDA: tanh_sparse + SparseCPU, SparseCUDA, SparseMPS: tanh_sparse SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: tanh_sparse_csr NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_tanh tags: [core, pointwise] @@ -6102,7 +6113,7 @@ variants: function, method dispatch: MkldnnCPU: mkldnn_tanh_ - SparseCPU, SparseCUDA: tanh_sparse_ + SparseCPU, SparseCUDA, SparseMPS: tanh_sparse_ 
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: tanh_sparse_csr_ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_tanh_ tags: pointwise @@ -6113,7 +6124,7 @@ structured_inherits: TensorIteratorBase dispatch: CPU, CUDA, MPS, MTIA: tanh_out - SparseCPU, SparseCUDA: tanh_sparse_out + SparseCPU, SparseCUDA, SparseMPS: tanh_sparse_out SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: tanh_sparse_csr_out tags: pointwise @@ -6385,8 +6396,8 @@ device_check: NoCheck # TensorIterator variants: function, method dispatch: - SparseCPU, SparseCUDA: trunc_sparse - SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: trunc_sparse_csr + SparseCPU, SparseCUDA, SparseMPS: trunc_sparse + SparseCsrCPU, SparseCsrCUDA, SparseCsrMPS, SparseCsrMeta: trunc_sparse_csr tags: [core, pointwise] - func: trunc_(Tensor(a!) self) -> Tensor(a!) @@ -6394,8 +6405,8 @@ device_check: NoCheck # TensorIterator variants: function, method dispatch: - SparseCPU, SparseCUDA: trunc_sparse_ - SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: trunc_sparse_csr_ + SparseCPU, SparseCUDA, SparseMPS: trunc_sparse_ + SparseCsrCPU, SparseCsrCUDA, SparseCsrMPS, SparseCsrMeta: trunc_sparse_csr_ tags: pointwise - func: trunc.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) @@ -6404,8 +6415,8 @@ device_check: NoCheck # TensorIterator dispatch: CPU, CUDA, MPS: trunc_out - SparseCPU, SparseCUDA: trunc_sparse_out - SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: trunc_sparse_csr_out + SparseCPU, SparseCUDA, SparseMPS: trunc_sparse_out + SparseCsrCPU, SparseCsrCUDA, SparseCsrMPS, SparseCsrMeta: trunc_sparse_csr_out tags: pointwise # Alias for trunc @@ -6915,7 +6926,7 @@ variants: function, method dispatch: CompositeExplicitAutograd: clone - SparseCPU, SparseCUDA: clone_sparse + SparseCPU, SparseCUDA, SparseMPS: clone_sparse SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: clone_sparse_compressed MkldnnCPU: mkldnn_clone QuantizedCPU, QuantizedCUDA: quantized_clone @@ -6950,7 +6961,7 @@ CPU, CUDA: zero_ MPS: zero_mps_ Meta: zero_meta_ - SparseCPU, SparseCUDA, SparseMeta: zero_sparse_ + SparseCPU, SparseCUDA, SparseMPS, SparseMeta: zero_sparse_ SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: zero_sparse_csr_ MkldnnCPU: mkldnn_zero_ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: zero_nested_ @@ -7156,6 +7167,7 @@ - func: _grouped_mm(Tensor self, Tensor mat2, Tensor? offs=None, Tensor? bias=None, ScalarType? out_dtype=None) -> Tensor variants: function dispatch: + CompositeExplicitAutograd: _grouped_mm CUDA: _grouped_mm_cuda # NOTE [ Sparse: autograd and API ] @@ -7367,8 +7379,8 @@ - func: _to_dense(Tensor self, ScalarType? dtype=None, bool? 
masked_grad=None) -> Tensor variants: method dispatch: - SparseCPU, SparseCUDA: sparse_to_dense - SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sparse_compressed_to_dense + SparseCPU, SparseCUDA, SparseMPS: sparse_to_dense + SparseCsrCPU, SparseCsrCUDA, SparseCsrMPS, SparseCsrMeta: sparse_compressed_to_dense MkldnnCPU: mkldnn_to_dense autogen: _to_dense.out @@ -7394,8 +7406,8 @@ - func: dense_dim(Tensor self) -> int variants: method dispatch: - SparseCPU, SparseCUDA, SparseMeta: dense_dim_sparse - SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: dense_dim_sparse_csr + SparseCPU, SparseCUDA, SparseMPS, SparseMeta: dense_dim_sparse + SparseCsrCPU, SparseCsrCUDA, SparseCsrMPS, SparseCsrMeta: dense_dim_sparse_csr CompositeExplicitAutograd: dense_dim_default device_check: NoCheck device_guard: False @@ -7528,7 +7540,7 @@ device_check: NoCheck # Allows copy into different device variants: function dispatch: - SparseCPU, SparseCUDA, SparseMeta: copy_sparse_ + SparseCPU, SparseCUDA, SparseMPS, SparseMeta: copy_sparse_ autogen: copy_sparse_to_sparse, copy_sparse_to_sparse.out # By adding the AutogradNestedTensor this makes this function CompositeImplicit-like for nested tensors @@ -9719,7 +9731,7 @@ structured_delegate: sign.out variants: function, method dispatch: - SparseCPU, SparseCUDA: sign_sparse + SparseCPU, SparseCUDA, SparseMPS: sign_sparse SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sign_sparse_csr tags: [core, pointwise] @@ -9728,7 +9740,7 @@ structured_delegate: sign.out variants: method dispatch: - SparseCPU, SparseCUDA: sign_sparse_ + SparseCPU, SparseCUDA, SparseMPS: sign_sparse_ SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sign_sparse_csr_ tags: pointwise @@ -9739,7 +9751,7 @@ dispatch: CPU, CUDA: sign_out MPS: sign_out_mps - SparseCPU, SparseCUDA: sign_sparse_out + SparseCPU, SparseCUDA, SparseMPS: sign_sparse_out SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sign_sparse_csr_out tags: pointwise @@ -9747,7 +9759,7 @@ variants: function, method structured_delegate: signbit.out dispatch: - SparseCPU, SparseCUDA: signbit_sparse + SparseCPU, SparseCUDA, SparseMPS: signbit_sparse SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: signbit_sparse_csr tags: pointwise @@ -9758,7 +9770,7 @@ CPU: signbit_out CUDA: signbit_out MPS: signbit_out_mps - SparseCPU, SparseCUDA: signbit_sparse_out + SparseCPU, SparseCUDA, SparseMPS: signbit_sparse_out SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: signbit_sparse_csr_out tags: pointwise @@ -9941,7 +9953,7 @@ structured: True structured_inherits: TensorIteratorBase dispatch: - CPU, CUDA: igamma_out + CPU, CUDA, MPS: igamma_out tags: pointwise - func: igamma(Tensor self, Tensor other) -> Tensor @@ -9958,7 +9970,7 @@ structured: True structured_inherits: TensorIteratorBase dispatch: - CPU, CUDA: igammac_out + CPU, CUDA, MPS: igammac_out tags: pointwise - func: igammac(Tensor self, Tensor other) -> Tensor @@ -13262,7 +13274,7 @@ dispatch: CompositeExplicitAutograd: isinf NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_isinf - SparseCPU, SparseCUDA: isinf_sparse + SparseCPU, SparseCUDA, SparseMPS: isinf_sparse SparseMeta: isinf_sparse_meta SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: isinf_sparse_csr autogen: isinf.out @@ -13278,7 +13290,7 @@ structured_delegate: isposinf.out dispatch: NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_isposinf - SparseCPU, SparseCUDA: isposinf_sparse + SparseCPU, SparseCUDA, SparseMPS: isposinf_sparse SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: isposinf_sparse_csr tags: pointwise @@ -13287,7 +13299,7 @@ 
structured_inherits: TensorIteratorBase dispatch: CPU, CUDA, MPS: isposinf_out - SparseCPU, SparseCUDA: isposinf_sparse_out + SparseCPU, SparseCUDA, SparseMPS: isposinf_sparse_out SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: isposinf_sparse_csr_out tags: pointwise @@ -13296,7 +13308,7 @@ structured_delegate: isneginf.out dispatch: NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_isneginf - SparseCPU, SparseCUDA: isneginf_sparse + SparseCPU, SparseCUDA, SparseMPS: isneginf_sparse SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: isneginf_sparse_csr tags: pointwise @@ -13305,7 +13317,7 @@ structured_inherits: TensorIteratorBase dispatch: CPU, CUDA, MPS: isneginf_out - SparseCPU, SparseCUDA: isneginf_sparse_out + SparseCPU, SparseCUDA, SparseMPS: isneginf_sparse_out SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: isneginf_sparse_csr_out tags: pointwise diff --git a/aten/src/ATen/native/quantized/QTensor.cpp b/aten/src/ATen/native/quantized/QTensor.cpp index 4ca777be9cd4..f804670c3153 100644 --- a/aten/src/ATen/native/quantized/QTensor.cpp +++ b/aten/src/ATen/native/quantized/QTensor.cpp @@ -335,6 +335,8 @@ std::tuple choose_qparams_optimized( const int64_t n_bins, const double ratio, int64_t bit_width) { + const float* input_row = input_tensor.const_data_ptr(); + TORCH_CHECK_VALUE(input_row != nullptr, "input tensor is empty and has no data"); if (numel < 0 || numel > input_tensor.numel()) { TORCH_CHECK(false, "numel is out of the bound of input tensor"); @@ -342,7 +344,7 @@ std::tuple choose_qparams_optimized( TORCH_CHECK(numel <= input_tensor.numel(), "numel ", numel, " greater than input_tensor.numel() ", input_tensor.numel()); - const float* input_row = input_tensor.const_data_ptr(); + float xmin = *std::min_element(input_row, input_row + numel); float xmax = *std::max_element(input_row, input_row + numel); float n_bins_float = static_cast(n_bins); diff --git a/aten/src/ATen/native/sparse/mps/FlattenIndices.mm b/aten/src/ATen/native/sparse/mps/FlattenIndices.mm new file mode 100644 index 000000000000..41efa545cd2a --- /dev/null +++ b/aten/src/ATen/native/sparse/mps/FlattenIndices.mm @@ -0,0 +1,73 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include +#include +#include +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#endif + +namespace at::native { +namespace { + +using namespace mps; +using namespace at::sparse; + +#ifndef PYTORCH_JIT_COMPILE_SHADERS +static auto& lib = mps::MetalShaderLibrary::getBundledLibrary(); +#else +#include +#endif + +Tensor flatten_indices_mps(const Tensor& indices, IntArrayRef size) { + TORCH_CHECK(indices.dim() == 2, "flatten_indices: indices must be 2D"); + TORCH_CHECK(static_cast(indices.size(0)) == size.size(), + "flatten_indices: indices.size(0) must equal size.size()"); + + const int64_t sparse_dim = indices.size(0); + const int64_t nnz = indices.size(1); + + if (nnz == 0) { + return at::empty({0}, indices.options().dtype(kLong)); + } + + // Row-major multipliers for flattening: mul[d] = prod_{j>d}(size[j]) + std::vector row_muls(sparse_dim); + row_muls[sparse_dim - 1] = 1; + for (int64_t i = sparse_dim - 2; i >= 0; --i) { + row_muls[i] = row_muls[i + 1] * size[i + 1]; + } + + auto flat_indices = at::empty({nnz}, indices.options().dtype(kLong)); + + auto stream = getCurrentMPSStream(); + dispatch_sync_with_rethrow(stream->queue(), ^() { + @autoreleasepool { + auto pipeline = lib.getPipelineStateForFunc("flatten_indices_kernel"); + auto encoder = stream->commandEncoder(); + 
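// Bind the compute pipeline, then hand the kernel the COO indices, the row
// multipliers, the output buffer, the sparse dimension count and the index
// strides, and launch one thread per nonzero (nnz).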
[encoder setComputePipelineState:pipeline]; + mtl_setArgs(encoder, + indices, + row_muls, + flat_indices, + static_cast(sparse_dim), + indices.strides() + ); + + mtl_dispatch1DJob(encoder, pipeline, nnz); + } + }); + return flat_indices; +} + +} // namespace +REGISTER_MPS_DISPATCH(flatten_indices_stub, &flatten_indices_mps) +} // namespace at::native \ No newline at end of file diff --git a/aten/src/ATen/native/sparse/mps/SparseMPSTensor.mm b/aten/src/ATen/native/sparse/mps/SparseMPSTensor.mm index 7ccdf4077542..3e0ac4e35da1 100644 --- a/aten/src/ATen/native/sparse/mps/SparseMPSTensor.mm +++ b/aten/src/ATen/native/sparse/mps/SparseMPSTensor.mm @@ -20,46 +20,9 @@ #ifndef PYTORCH_JIT_COMPILE_SHADERS static auto& lib = mps::MetalShaderLibrary::getBundledLibrary(); #else -#include +#include #endif - -static Tensor flatten_indices(const Tensor& indices, IntArrayRef size) { - - TORCH_CHECK(indices.dim() == 2, "flatten_indices: indices must be 2D"); - TORCH_CHECK(static_cast(indices.size(0)) == size.size(), - "flatten_indices: indices.size(0) must equal size.size()"); - - int64_t sparse_dim = indices.size(0); - int64_t nnz = indices.size(1); - - if (nnz == 0) { - return at::empty({0}, indices.options().dtype(kLong)); - } - - std::vector strides(sparse_dim); - strides[sparse_dim - 1] = 1; - for (int64_t i = sparse_dim - 2; i >= 0; i--) { - strides[i] = strides[i + 1] * size[i + 1]; - } - - Tensor flat_indices = at::empty({nnz}, indices.options().dtype(kLong)); - - auto stream = getCurrentMPSStream(); - dispatch_sync_with_rethrow(stream->queue(), ^() { - @autoreleasepool { - auto pipeline = lib.getPipelineStateForFunc("flatten_indices_kernel"); - auto encoder = stream->commandEncoder(); - [encoder setComputePipelineState:pipeline]; - - mtl_setArgs(encoder, indices, strides, flat_indices, sparse_dim, nnz); - mtl_dispatch1DJob(encoder, pipeline, nnz); - } - }); - - return flat_indices; -} - static Tensor compute_output_positions(const Tensor& is_unique) { int64_t nnz = is_unique.size(0); diff --git a/aten/src/ATen/native/sparse/mps/SparseMPSTensorMath.mm b/aten/src/ATen/native/sparse/mps/SparseMPSTensorMath.mm new file mode 100644 index 000000000000..07ee2e097b49 --- /dev/null +++ b/aten/src/ATen/native/sparse/mps/SparseMPSTensorMath.mm @@ -0,0 +1,183 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#include +#include +#include +#include +#include +#endif + +namespace at::native { + +using namespace at::sparse; + +Tensor& add_out_dense_sparse_mps(Tensor& out, const Tensor& dense, const SparseTensor& sparse, const Scalar& alpha); + +Tensor& add_out_dense_sparse_mps( + Tensor& out, + const Tensor& dense, + const SparseTensor& sparse, + const Scalar& alpha) { + TORCH_CHECK(dense.is_mps(), "add: expected 'self' to be an MPS tensor, got ", dense.device()); + TORCH_CHECK(sparse.is_mps(), "add: expected 'other' to be an MPS tensor, got ", sparse.device()); + TORCH_CHECK(out.is_mps(), "add: expected 'out' to be an MPS tensor, got ", out.device()); + TORCH_CHECK(dense.sizes().equals(sparse.sizes()), + "add: expected 'self' and 'other' to have same size, but self has size ", + dense.sizes(), " while other has size ", sparse.sizes(), + " (FYI: dense-sparse addition does not currently support broadcasting)"); + + const int64_t nnz = sparse._nnz(); + if (nnz == 0) { + out.resize_as_(dense); + out.copy_(dense); + return out; + } + + auto commonDtype = at::result_type(dense, sparse); + 
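// Accumulate in the promoted common dtype of the dense and sparse operands;
// the check below ensures that dtype can be cast back into `out`.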
TORCH_CHECK(canCast(commonDtype, out.scalar_type()), + "Can't convert result type ", commonDtype, " to output ", out.scalar_type()); + + Tensor r; + const bool need_separate_buffer = out.is_same(dense) || (out.scalar_type() != commonDtype); + if (need_separate_buffer) { + r = at::empty(dense.sizes(), out.options().dtype(commonDtype)); + } else { + r = out; + r.resize_as_(dense); + } + + Tensor dense_buffer = dense.to(commonDtype); + if (!r.is_same(dense_buffer)) { + r.copy_(dense_buffer); + } + + Tensor indices = sparse._indices(); + Tensor values = sparse._values().to(commonDtype); + if (values.numel() == 0) { + if (!out.is_same(r)) { + out.resize_as_(dense); + out.copy_(r); + } + return out; + } + + const int64_t nDim = r.dim(); + const int64_t nDimI = sparse.sparse_dim(); + TORCH_CHECK(nDimI >= 0 && nDimI <= nDim, + "Invalid sparse_dim=", nDimI, " for dense tensor of dim ", nDim); + + Tensor indices1D = at::sparse::flatten_indices(indices, sparse.sizes()).contiguous(); + + int64_t view_rows = 1; + int64_t view_cols = 1; + for (int64_t i = 0; i < nDimI; i++) { + view_rows *= r.size(i); + } + for (int64_t i = nDimI; i < nDim; i++) { + view_cols *= r.size(i); + } + + if (view_cols == 1) { + Tensor r_flat = r.reshape({view_rows}); + Tensor values_1d = values.reshape({nnz}); + r_flat.index_add_(0, indices1D, values_1d, alpha); + } else { + Tensor r_view = r.view({view_rows, view_cols}); + Tensor values_2d = values.reshape({nnz, view_cols}); + r_view.index_add_(0, indices1D, values_2d, alpha); + } + + if (!out.is_same(r)) { + out.resize_as_(dense); + out.copy_(r); + } + return out; +} + + +SparseTensor& add_out_sparse_mps(const SparseTensor& self, + const SparseTensor& other, + const Scalar& alpha, + SparseTensor& out) { + TORCH_CHECK(other.is_sparse(), "add(sparse, dense) is not supported. 
Use add(dense, sparse) instead."); + TORCH_CHECK(self.is_mps(), "add: expected 'self' to be MPS, but got ", self.device()); + TORCH_CHECK(other.is_mps(), "add: expected 'other' to be MPS, but got ", other.device()); + TORCH_CHECK(out.is_mps(), "add: expected 'out' to be MPS, but got ", out.device()); + if (!self.is_sparse()) { + return add_out_dense_sparse_mps(out, self, other, alpha); + } + auto commonDtype = at::result_type(self, other); + TORCH_CHECK(canCast(commonDtype, out.scalar_type()), + "Can't convert result type ", commonDtype, " to output ", out.scalar_type()); + + TORCH_CHECK(self.sizes().equals(other.sizes()), + "add: expected 'self' and 'other' to have same size, but ", self.sizes(), " != ", other.sizes()); + + if (other._nnz() == 0) { + out.resize_as_(self); + Tensor vals = self._values(); + if (vals.scalar_type() != out.scalar_type()) { + vals = vals.to(out.scalar_type()); + } + alias_into_sparse(out, self._indices(), vals); + out._coalesced_(self.is_coalesced()); + return out; + } + + if (self._nnz() == 0) { + out.resize_as_(other); + Tensor vals = other._values(); + if (!alpha.isIntegral(false) || alpha.to() != 1.0) { + vals = at::mul(vals, alpha); + } + if (vals.scalar_type() != out.scalar_type()) { + vals = vals.to(out.scalar_type()); + } + alias_into_sparse(out, other._indices(), vals); + out._coalesced_(other.is_coalesced()); + return out; + } + + TORCH_CHECK(is_same_density(self, other), + "add: expected 'self' and 'other' to have same density, but 'self' has ", + self.sparse_dim(), " sparse dimensions while 'other' has ", other.sparse_dim(), " sparse dimensions"); + + Tensor t_indices_ = self._indices(); + Tensor s_indices_ = other._indices(); + + Tensor t_values_ = self._values().to(commonDtype); + Tensor s_values_ = other._values().to(commonDtype); + if (!alpha.isIntegral(false) || alpha.to() != 1.0) { + s_values_ = at::mul(s_values_, alpha); + } + + Tensor r_indices_ = at::cat({t_indices_, s_indices_}, 1); + Tensor r_values_ = at::cat({t_values_, s_values_ }, 0); + + SparseTensor tmp = empty({0}, out.options().dtype(commonDtype)); + tmp.resize_as_(other); + alias_into_sparse(tmp, r_indices_, r_values_); + tmp = _coalesce_sparse_mps(tmp); + + out.resize_as_(other); + Tensor out_vals = tmp._values(); + if (out.scalar_type() != commonDtype) { + out_vals = out_vals.to(out.scalar_type()); + } + alias_into_sparse(out, tmp._indices(), out_vals); + out._coalesced_(tmp.is_coalesced()); + + return out; +} + +} // namespace at::native \ No newline at end of file diff --git a/aten/src/ATen/native/sparse/mps/kernels/Sparse.metal b/aten/src/ATen/native/sparse/mps/kernels/Coalesce.metal similarity index 89% rename from aten/src/ATen/native/sparse/mps/kernels/Sparse.metal rename to aten/src/ATen/native/sparse/mps/kernels/Coalesce.metal index 8b85950e393a..e32d1edf1c2f 100644 --- a/aten/src/ATen/native/sparse/mps/kernels/Sparse.metal +++ b/aten/src/ATen/native/sparse/mps/kernels/Coalesce.metal @@ -2,19 +2,6 @@ #include using namespace metal; -kernel void flatten_indices_kernel( - device const int64_t* indices [[buffer(0)]], - device const int64_t* strides [[buffer(1)]], - device int64_t* flat_indices [[buffer(2)]], - constant uint& sparse_dim [[buffer(3)]], - constant uint& nnz [[buffer(4)]], - uint gid [[thread_position_in_grid]]) { - int64_t flat_idx = 0; - for (uint d = 0; d < sparse_dim; d++) { - flat_idx += indices[d * nnz + gid] * strides[d]; - } - flat_indices[gid] = flat_idx; -} kernel void compute_output_positions_kernel( device const bool* is_unique [[buffer(0)]], @@ 
-125,4 +112,6 @@ INSTANTIATE_COALESCE_WITH_POSITIONS(long); INSTANTIATE_COALESCE_WITH_POSITIONS(char); INSTANTIATE_COALESCE_WITH_POSITIONS(uchar); INSTANTIATE_COALESCE_WITH_POSITIONS(short); -INSTANTIATE_COALESCE_WITH_POSITIONS(int); \ No newline at end of file +INSTANTIATE_COALESCE_WITH_POSITIONS(int); +INSTANTIATE_COALESCE_WITH_POSITIONS(float2); +INSTANTIATE_COALESCE_WITH_POSITIONS(half2); \ No newline at end of file diff --git a/aten/src/ATen/native/sparse/mps/kernels/FlattenIndices.metal b/aten/src/ATen/native/sparse/mps/kernels/FlattenIndices.metal new file mode 100644 index 000000000000..00156dddb06c --- /dev/null +++ b/aten/src/ATen/native/sparse/mps/kernels/FlattenIndices.metal @@ -0,0 +1,19 @@ +#include +using namespace metal; + + +kernel void flatten_indices_kernel( + device const long* indices [[ buffer(0) ]], + device const long* row_muls [[ buffer(1) ]], + device long* flat_indices [[ buffer(2) ]], + constant uint& sparse_dim [[ buffer(3) ]], + constant long2& idx_strides [[ buffer(4) ]], + uint gid [[ thread_position_in_grid ]]) { + long flat = 0; + for (uint d = 0; d < sparse_dim; ++d) { + long off = (long)d * idx_strides.x + (long)gid * idx_strides.y; + long v = indices[off]; + flat += v * row_muls[d]; + } + flat_indices[gid] = flat; +} \ No newline at end of file diff --git a/aten/src/ATen/native/transformers/cuda/attention.cu b/aten/src/ATen/native/transformers/cuda/attention.cu index 1a3e2825d4fa..b8b43e0086c1 100644 --- a/aten/src/ATen/native/transformers/cuda/attention.cu +++ b/aten/src/ATen/native/transformers/cuda/attention.cu @@ -1396,12 +1396,15 @@ std::tuple _efficient_ at::Tensor v_t = value.transpose(1, 2); at::Tensor output_t = res.transpose(1, 2); bool is_causal; - if (static_cast(sdp::CustomMaskType::CausalFromTopLeft) == custom_mask_type) { - is_causal = true; - } else if (static_cast(sdp::CustomMaskType::NoCustomMask) == custom_mask_type) { + if (static_cast(sdp::CustomMaskType::NoCustomMask) == custom_mask_type) { is_causal = false; } else { - TORCH_CHECK(false, "[_efficient_attention_forward] Unsupported mask type on ROCM, for now"); + is_causal = true; +#if AOTRITON_V3_API == 0 + if (static_cast(sdp::CustomMaskType::CausalFromTopLeft) != custom_mask_type) { + TORCH_CHECK(false, "[_efficient_attention_forward] Unsupported mask type on ROCM, for now"); + } +#endif } at::Tensor atomic_counter; @@ -1426,7 +1429,51 @@ std::tuple _efficient_ auto offset_output = mk_philoxtensor(use_philox_state ? offset_t.data_ptr() : nullptr); auto persistent_counter = mk_atomictensor(is_causal ? atomic_counter.data_ptr() : nullptr); hipError_t err; // TODO: Error handling - if (seqstart_q.has_value()) { + if constexpr (AOTRITON_ALWAYS_V3_API) { // Better readability than nesting ifdef +#if AOTRITON_V3_API // if constexpr does not stop errors from undefined functions + using aotriton::v3::flash::CausalType; + using aotriton::v3::flash::VarlenType; + using aotriton::v3::flash::WindowValue; + aotriton::v3::flash::attn_fwd_params params; + params.Q = mk_aotensor(q_t, "q"); + params.K = mk_aotensor(k_t, "k"); + params.V = mk_aotensor(v_t, "v"); + params.Sm_scale = softmax_scale; + params.L = compute_logsumexp ? 
mk_aotensor<2>(softmax_lse, "M") : empty_t2; + params.Out = mk_aotensor(output_t, "Out"); + params.Max_seqlen_q = max_seqlen_q; // Unused if cu_seqlens_q is empty + params.Max_seqlen_k = max_seqlen_k; // Unused if cu_seqlens_k is empty + params.dropout_p = dropout_p; + params.philox_seed_ptr = seed; + params.philox_offset1 = offset1; + params.philox_offset2 = offset2; + params.philox_seed_output = seed_output; + params.philox_offset_output = offset_output; + params.encoded_softmax = mk_aotensor(softmax_fa_t, "encoded_softmax"); + params.persistent_atomic_counter = persistent_counter; + params.causal_type = is_causal ? CausalType::WindowedAttention : CausalType::None; + if (static_cast(sdp::CustomMaskType::CausalFromTopLeft) == custom_mask_type) { + params.window_left = WindowValue::TopLeftAligned; + params.window_right = WindowValue::TopLeftAligned; + } else if (static_cast(sdp::CustomMaskType::CausalFromBottomRight) == custom_mask_type) { + params.window_left = WindowValue::BottomRightAligned; + params.window_right = WindowValue::BottomRightAligned; + } + if (bias.has_value()) { + params.B = mk_aotensor(bias.value(), "bias"); + } + if (seqstart_q.has_value()) { + params.varlen_type = VarlenType::CompactVarlen; + params.cu_seqlens_q = mk_aotensor<1>(seqstart_q.value(), "cu_seqlens_q"); + params.cu_seqlens_k = mk_aotensor<1>(seqstart_k.value(), "cu_seqlens_k"); + } else { + params.varlen_type = VarlenType::None; + } + err = aotriton::v3::flash::attn_fwd(params, + aotriton::v3::flash::attn_fwd_params::kVersion, + stream); +#endif // AOTRITON_V3_API + } else if (seqstart_q.has_value()) { // varlen aka nested tensor err = attn_fwd_compact_varlen(mk_aotensor(q_t, "q"), mk_aotensor(k_t, "k"), diff --git a/aten/src/ATen/native/transformers/cuda/attention_backward.cu b/aten/src/ATen/native/transformers/cuda/attention_backward.cu index 6940bbbcb812..55fc1e261219 100644 --- a/aten/src/ATen/native/transformers/cuda/attention_backward.cu +++ b/aten/src/ATen/native/transformers/cuda/attention_backward.cu @@ -24,6 +24,7 @@ #include #include #else +#include #include #include #include @@ -47,6 +48,7 @@ #include #include #else +#include // MemoryEfficient Attention Specific Imports for ROCM #ifndef DISABLE_AOTRITON #include @@ -544,12 +546,15 @@ _efficient_attention_backward( } const auto softmax_scale = sdp::calculate_scale(query, scale).expect_float(); bool is_causal; - if (static_cast(sdp::CustomMaskType::CausalFromTopLeft) == custom_mask_type) { - is_causal = true; - } else if (static_cast(sdp::CustomMaskType::NoCustomMask) == custom_mask_type) { + if (static_cast(sdp::CustomMaskType::NoCustomMask) == custom_mask_type) { is_causal = false; } else { - TORCH_CHECK(false, "[_efficient_attention_backward] Unsupported mask type in AOTriton, for now"); + is_causal = true; +#if AOTRITON_V3_API == 0 + if (static_cast(sdp::CustomMaskType::CausalFromTopLeft) != custom_mask_type) { + TORCH_CHECK(false, "[_efficient_attention_forward] Unsupported mask type on ROCM, for now"); + } +#endif } at::Tensor q_t = query.permute({0,2,1,3}); at::Tensor k_t = key.permute({0,2,1,3}); @@ -568,7 +573,62 @@ _efficient_attention_backward( using sdp::aotriton_adapter::mk_aoscalartensor; using sdp::aotriton_adapter::cast_dtype; aotriton::TensorView<4> empty_t4(0, {0, 0, 0, 0}, {0, 0, 0, 0}, cast_dtype(query.dtype())); - if (cu_seqlens_q.has_value()) { + if constexpr (AOTRITON_ALWAYS_V3_API) { // Better readability than nesting ifdef +#if AOTRITON_V3_API // if constexpr does not stop errors from undefined functions + using 
aotriton::v3::flash::CausalType; + using aotriton::v3::flash::VarlenType; + using aotriton::v3::flash::WindowValue; + aotriton::v3::flash::attn_bwd_params params; + params.Q = mk_aotensor(q_t, "q"); + params.K = mk_aotensor(k_t, "k"); + params.V = mk_aotensor(v_t, "v"); + params.B = bias.has_value() ? mk_aotensor(bias.value(), "bias") : empty_t4; + params.Sm_scale = softmax_scale; + params.Out = mk_aotensor(out_t, "out"); + params.DO = mk_aotensor(dout_t, "dout"); + params.DK = mk_aotensor(dk_t, "dk"); + params.DV = mk_aotensor(dv_t, "dv"); + params.DQ = mk_aotensor(dq_t, "dq"); + params.DB = bias_requires_grad ? mk_aotensor(grad_bias, "db") : empty_t4; + params.L = mk_aotensor<2>(softmax_lse, "L"); + params.Max_seqlen_q = max_seqlen_q; // Unused if cu_seqlens_q is empty + params.Max_seqlen_k = max_seqlen_k; // Unused if cu_seqlens_k is empty + params.dropout_p = float(dropout_p); + params.philox_seed_ptr = mk_aoscalartensor(philox_seed); + params.philox_offset1 = mk_aoscalartensor(philox_offset); + params.philox_offset2 = 0; + params.causal_type = is_causal ? CausalType::WindowedAttention : CausalType::None; + if (static_cast(sdp::CustomMaskType::CausalFromTopLeft) == custom_mask_type) { + params.window_left = WindowValue::TopLeftAligned; + params.window_right = WindowValue::TopLeftAligned; + } else if (static_cast(sdp::CustomMaskType::CausalFromBottomRight) == custom_mask_type) { + params.window_left = WindowValue::BottomRightAligned; + params.window_right = WindowValue::BottomRightAligned; + } +#if AOTRITON_ALWAYS_V3_API + using sdp::aotriton_adapter::mklazy_empty_like; + using sdp::aotriton_adapter::mklazy_fp32zeros; + using sdp::aotriton_adapter::LazyTensorContext; + LazyTensorContext lazy_delta { .like_tensor = softmax_lse, .tensor_name = "delta" }; + LazyTensorContext lazy_dq_acc { .like_tensor = dq_t, .tensor_name = "dq_acc" }; + params.D = mklazy_empty_like<2>(&lazy_delta); + params.DQ_ACC = mklazy_fp32zeros<4>(&lazy_dq_acc); +#else + at::Tensor delta = at::empty_like(softmax_lse).contiguous(); + params.D = mk_aotensor<2>(delta, "delta"); +#endif + if (cu_seqlens_q.has_value()) { + params.varlen_type = VarlenType::CompactVarlen; + params.cu_seqlens_q = mk_aotensor<1>(cu_seqlens_q.value(), "cu_seqlens_q"); + params.cu_seqlens_k = mk_aotensor<1>(cu_seqlens_k.value(), "cu_seqlens_k"); + } else { + params.varlen_type = VarlenType::None; + } + err = aotriton::v3::flash::attn_bwd(params, + aotriton::v3::flash::attn_bwd_params::kVersion, + stream); +#endif // AOTRITON_V3_API + } else if (cu_seqlens_q.has_value()) { at::Tensor delta = at::empty_like(softmax_lse).contiguous(); // varlen aka Nested tensor err = attn_bwd_compact_varlen(mk_aotensor(q_t, "q"), diff --git a/aten/src/ATen/native/transformers/cuda/sdp_utils.cpp b/aten/src/ATen/native/transformers/cuda/sdp_utils.cpp index 00a43920b096..660aee3647ce 100644 --- a/aten/src/ATen/native/transformers/cuda/sdp_utils.cpp +++ b/aten/src/ATen/native/transformers/cuda/sdp_utils.cpp @@ -16,6 +16,7 @@ #include #include #include +#include #if AT_CUDNN_ENABLED() #include @@ -25,9 +26,12 @@ #if USE_ROCM #if defined(USE_FLASH_ATTENTION) || defined(USE_MEM_EFF_ATTENTION) +#include #include #define USE_ROCM_ATTENTION 1 #endif +#else +#define USE_ROCM_ATTENTION 0 #endif // Avoid potential compiler -Wall -Werror complains undefined macro @@ -72,13 +76,14 @@ bool priority_order_init_ = false; // TODO(eqy): more benchmarking to determine whether this should include sm86/89 // Needs to be kept in-sync with test_fused_chocie in test_transformers.py bool 
check_prefer_cudnn_attention() { - static const bool prefer_cudnn = c10::utils::check_env("TORCH_CUDNN_SDPA_PREFERRED") == true; + static const bool prefer_cudnn = c10::utils::check_env("TORCH_CUDNN_SDPA_PREFERRED") != false; if (!prefer_cudnn) { return false; } -#if (defined(CUDNN_VERSION) && (CUDNN_VERSION > 90000)) +#if (defined(CUDNN_VERSION) && (CUDNN_VERSION >= 90900)) auto dprops = at::cuda::getCurrentDeviceProperties(); - return dprops->major >= 9 && !dprops->minor; + auto major = dprops->major; + return (major == 9 || major == 10) && !dprops->minor; #else return false; #endif @@ -129,9 +134,24 @@ int64_t minimum_gemm_alignment(sdp_params const& params) { // caller_is_meff is added to make the TORCH_WARN message showing the correct result template bool check_head_dim_size_flash(sdp_params const& params, bool debug) { -#if USE_ROCM_ATTENTION && AOTRITON_VERSION_MINOR >= 9 +#if USE_ROCM_ATTENTION // AOTriton 0.9+ supports head_dim up to 512 - const auto max_size = c10::SymInt(512); + const static auto max_hdim = []() { +#if AOTRITON_VERSION_CURRENT == AOTRITON_VERSION_INT(0, 11) + // gfx11xx only support hdim <= 256 on AOTriton 0.11 + auto dprops = at::cuda::getCurrentDeviceProperties(); + const c10::basic_string_view arch(dprops->gcnArchName); + if (arch.starts_with("gfx11")) { + return 256; + } +#endif // AOTriton 0.11 +#if AOTRITON_VERSION_CURRENT >= AOTRITON_VERSION_INT(0, 9) + return 512; +#else + return 256; +#endif + }(); + const auto max_size = c10::SymInt(max_hdim); #else // All head_dim sizes must be equal and less than 256 const auto max_size = c10::SymInt(256); @@ -586,7 +606,7 @@ bool check_for_nested_inputs(sdp_params const& params, bool debug) { const auto dprop = at::cuda::getCurrentDeviceProperties(); // Check that the input is nested - if ((dprop->major == 9 || dprop->major == 10) && has_for_nested_inputs(params)) { + if (!(dprop->major == 9 || dprop->major == 10) && has_for_nested_inputs(params)) { if (debug) { TORCH_WARN("cuDNN SDPA supports nested tensors on SM 9.0, SM 10.0."); } @@ -646,6 +666,15 @@ bool can_use_cudnn_attention(const sdp_params& params, bool debug) { TORCH_WARN(CUDNN_VERSION, " cuDNN version too old to use cuDNN Attention (< v9.0.0)"); } return false; +#endif +#if defined(CUDNN_VERSION) + static auto cudnn_version = cudnnGetVersion(); + if (params.dropout > 0.0 && cudnn_version > 91100 && cudnn_version < 91400) { + if (debug) { + TORCH_WARN(CUDNN_VERSION, " cuDNN version does not support dropout in SDPA (9.11 - 9.13)."); + } + return false; + } #endif // Define gate functions that determine if a flash kernel can be ran // Replace with std::to_array when we migrate to c++20 diff --git a/aten/src/ATen/native/transformers/hip/aotriton_adapter.h b/aten/src/ATen/native/transformers/hip/aotriton_adapter.h index aedb205e5710..d316808cf9be 100644 --- a/aten/src/ATen/native/transformers/hip/aotriton_adapter.h +++ b/aten/src/ATen/native/transformers/hip/aotriton_adapter.h @@ -2,8 +2,12 @@ #ifdef USE_ROCM +// Expect to be included after headers of at::zeros_like and at::empty_like + #include #include +#include +#include //////////////////////////////////////////////////////////////////////////////// // Common macros copied from cuda/mem_eff_attention/gemm_kernel_utils.h @@ -111,6 +115,61 @@ inline aotriton::TensorView<0> mk_atomictensor(const int32_t* ptr) aotriton::DType::kInt32); } +#if AOTRITON_VERSION_CURRENT >= AOTRITON_VERSION_INT(0, 11) + +struct LazyTensorContext { + at::Tensor like_tensor; + std::string_view tensor_name; + at::Tensor tensor; +};
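// LazyTensorFunctions (below) materializes the tensor described by a
// LazyTensorContext the first time acquire() is invoked: fp32 zeros when
// kRequireZeros is set (used for the dq accumulator), otherwise an empty
// tensor shaped like like_tensor (used for delta).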
+ +template +struct LazyTensorFunctions : public LazyTensorContext { + static aotriton::TensorView acquire(void* cookie) { + auto ctx = (LazyTensorContext*)cookie; + if (!ctx->tensor.defined()) { + auto q = ctx->like_tensor; + if constexpr (kRequireZeros) { + ctx->tensor = at::zeros(q.sizes(), + q.options().dtype(at::kFloat)); + } else { + ctx->tensor = at::empty_like(q); + } + } + return mk_aotensor(ctx->tensor, ctx->tensor_name); + } + + static void dispose(void* cookie) { + } +}; + +template +aotriton::LazyTensor mklazy_common(LazyTensorContext* cookie) +{ + using LTF = LazyTensorFunctions; + return aotriton::LazyTensor { + .cookie = cookie, + .acquire = <F::acquire, + .dispose = <F::dispose + }; +} + +template +auto mklazy_empty_like(LazyTensorContext* cookie) +{ + return mklazy_common(cookie); +} + + +// Note: this will not keep the original strides +template +auto mklazy_fp32zeros(LazyTensorContext* cookie) +{ + return mklazy_common(cookie); +} + +#endif // >= 0.11 + } // namespace aotriton_adapter } // namespace sdp diff --git a/aten/src/ATen/native/transformers/hip/aotriton_versions.h b/aten/src/ATen/native/transformers/hip/aotriton_versions.h new file mode 100644 index 000000000000..2f5d3f0e1222 --- /dev/null +++ b/aten/src/ATen/native/transformers/hip/aotriton_versions.h @@ -0,0 +1,20 @@ +#pragma once + +#ifdef USE_ROCM + +#define AOTRITON_VERSION_INT(x, y) (x * 100 + y) +#define AOTRITON_VERSION_CURRENT (AOTRITON_VERSION_MAJOR * 100 + AOTRITON_VERSION_MINOR) + +#if AOTRITON_VERSION_CURRENT >= AOTRITON_VERSION_INT(0, 11) +#define AOTRITON_ALWAYS_V3_API 1 +#else +#define AOTRITON_ALWAYS_V3_API 0 +#endif + +#if AOTRITON_VERSION_CURRENT >= AOTRITON_VERSION_INT(0, 10) +#define AOTRITON_V3_API 1 +#else +#define AOTRITON_V3_API 0 +#endif + +#endif diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/aot/mha_all_aot.hip b/aten/src/ATen/native/transformers/hip/flash_attn/aot/mha_all_aot.hip index 1d4926c02274..b5b1ed429289 100644 --- a/aten/src/ATen/native/transformers/hip/flash_attn/aot/mha_all_aot.hip +++ b/aten/src/ATen/native/transformers/hip/flash_attn/aot/mha_all_aot.hip @@ -60,20 +60,13 @@ #include // AOTriton headers -#include #include #include -#if AOTRITON_VERSION_MINOR < 9 +#if AOTRITON_VERSION_CURRENT < AOTRITON_VERSION_INT(0, 9) #error "This adaptor code is only tested with AOTriton >= 0.9" #endif -#if (AOTRITON_VERSION_MAJOR * 100 + AOTRITON_VERSION_MINOR) >= 10 -#define V3_API 1 -#else -#define V3_API 0 -#endif - namespace pytorch_flash { namespace { @@ -93,15 +86,15 @@ calculate_swa(std::optional window_size_left, int max_seqlen_q, int max_seqlen_k, bool is_causal) { -#if V3_API // SWA is exposed through V3 API +#if AOTRITON_V3_API // SWA is exposed through V3 API bool needs_swa = false; using aotriton::v3::flash::WindowValue; // Default values when std::optional window_size_left/right have no value int window_left = max_seqlen_q; int window_right = max_seqlen_k; if (is_causal) { - window_left = WindowValue::TopLeftAligned; - window_right = WindowValue::TopLeftAligned; + window_left = WindowValue::BottomRightAligned; + window_right = WindowValue::BottomRightAligned; } if (window_size_left.has_value() || window_size_right.has_value()) { needs_swa = true; @@ -248,10 +241,10 @@ mha_fwd_aot(const at::Tensor &q, // batch_size x seqlen_q x num_heads x seqlen_q, seqlen_k, is_causal); -#if V3_API +#if AOTRITON_V3_API const bool uses_swa = needs_swa; #else - // When V3_API = 0, uses_swa is constexpr and the if (uses_swa) branch can be + // When AOTRITON_V3_API = 0, 
uses_swa is constexpr and the if (uses_swa) branch can be // optimized out (hopefully). constexpr bool uses_swa = false; #endif @@ -278,8 +271,8 @@ mha_fwd_aot(const at::Tensor &q, // batch_size x seqlen_q x num_heads x auto seed_output = mk_philoxtensor(use_philox_state ? seed_t.data_ptr() : nullptr); auto offset_output = mk_philoxtensor(use_philox_state ? offset_t.data_ptr() : nullptr); auto persistent_counter = mk_atomictensor(is_causal ? atomic_counter.data_ptr() : nullptr); - if (uses_swa) { -#if V3_API + if (uses_swa || AOTRITON_ALWAYS_V3_API) { +#if AOTRITON_V3_API using aotriton::v3::flash::CausalType; using aotriton::v3::flash::VarlenType; aotriton::v3::flash::attn_fwd_params params; @@ -299,7 +292,7 @@ mha_fwd_aot(const at::Tensor &q, // batch_size x seqlen_q x num_heads x params.philox_offset_output = offset_output; params.encoded_softmax = mk_aotensor(softmax_fa_t, "encoded_softmax"); params.persistent_atomic_counter = persistent_counter; - params.causal_type = CausalType::WindowedAttention; + params.causal_type = is_causal ? CausalType::WindowedAttention : CausalType::None; params.varlen_type = VarlenType::None; params.window_left = window_left; params.window_right = window_right; @@ -449,10 +442,10 @@ mha_varlen_fwd_aot(const at::Tensor &q, // total_q x num_heads x head_size, tot max_seqlen_q, max_seqlen_k, is_causal); -#if V3_API +#if AOTRITON_V3_API const bool uses_swa = needs_swa; #else - // When V3_API = 0, uses_swa is constexpr and the if (uses_swa) branch can be + // When AOTRITON_V3_API = 0, uses_swa is constexpr and the if (uses_swa) branch can be // optimized out (hopefully). constexpr bool uses_swa = false; #endif @@ -482,8 +475,8 @@ mha_varlen_fwd_aot(const at::Tensor &q, // total_q x num_heads x head_size, tot auto seed_output = use_philox_state ? mk_philoxtensor(seed_t.data_ptr()) : nullscalar; auto offset_output = use_philox_state ? mk_philoxtensor(offset_t.data_ptr()) : nullscalar; auto persistent_counter = is_causal ? mk_philoxtensor(atomic_counter.data_ptr()) : nullscalar; - if (uses_swa) { -#if V3_API + if (uses_swa || AOTRITON_ALWAYS_V3_API) { +#if AOTRITON_V3_API using aotriton::v3::flash::CausalType; using aotriton::v3::flash::VarlenType; aotriton::v3::flash::attn_fwd_params params; @@ -505,7 +498,7 @@ mha_varlen_fwd_aot(const at::Tensor &q, // total_q x num_heads x head_size, tot params.philox_offset_output = offset_output; params.encoded_softmax = mk_aotensor(softmax_fa_t, "encoded_softmax"); params.persistent_atomic_counter = persistent_counter; - params.causal_type = CausalType::WindowedAttention; + params.causal_type = is_causal ? 
CausalType::WindowedAttention : CausalType::None; params.varlen_type = VarlenType::CompactVarlen; params.window_left = window_left; params.window_right = window_right; @@ -599,10 +592,6 @@ mha_bwd_aot(const at::Tensor &dout, // batch_size x seqlen_q x num_heads, x hea const int seqlen_k = k.size(1); const int num_heads_k = k.size(2); - if (is_causal){ - TORCH_CHECK((seqlen_q == seqlen_k), "For backwards kernel seqlen_q must equal seqlen_k for causal kernels"); - } - TORCH_CHECK(batch_size > 0, "batch size must be positive"); TORCH_CHECK(head_size % 8 == 0, "head_size should be a multiple of 8"); TORCH_CHECK(head_size_og % 8 == 0, "head_size_og should be a multiple of 8, this is ensured by padding!"); @@ -654,10 +643,10 @@ mha_bwd_aot(const at::Tensor &dout, // batch_size x seqlen_q x num_heads, x hea seqlen_q, seqlen_k, is_causal); -#if V3_API +#if AOTRITON_V3_API const bool uses_swa = needs_swa; #else - // When V3_API = 0, uses_swa is constexpr and the if (uses_swa) branch can be + // When AOTRITON_V3_API = 0, uses_swa is constexpr and the if (uses_swa) branch can be // optimized out (hopefully). constexpr bool uses_swa = false; #endif @@ -681,10 +670,9 @@ mha_bwd_aot(const at::Tensor &dout, // batch_size x seqlen_q x num_heads, x hea hipError_t err; // TODO: Error handling using sdp::aotriton_adapter::mk_aotensor; using sdp::aotriton_adapter::mk_aoscalartensor; - if (uses_swa) { -#if V3_API + if (uses_swa || AOTRITON_ALWAYS_V3_API) { +#if AOTRITON_V3_API // Fused BWD does not support SWA - at::Tensor delta = at::empty_like(softmax_lse_cont).contiguous(); using aotriton::v3::flash::CausalType; using aotriton::v3::flash::VarlenType; aotriton::v3::flash::attn_bwd_params params; @@ -694,21 +682,32 @@ mha_bwd_aot(const at::Tensor &dout, // batch_size x seqlen_q x num_heads, x hea params.Sm_scale = softmax_scale; params.Out = mk_aotensor(out_t, "out"); params.DO = mk_aotensor(dout_t, "dout"); - params.DK = mk_aotensor(dq_t, "dq"); - params.DV = mk_aotensor(dk_t, "dk"); - params.DQ = mk_aotensor(dv_t, "dv"); + params.DQ = mk_aotensor(dq_t, "dq"); + params.DK = mk_aotensor(dk_t, "dk"); + params.DV = mk_aotensor(dv_t, "dv"); params.L = mk_aotensor<2>(softmax_lse_cont, "L"); - params.D = mk_aotensor<2>(delta, "delta"); params.Max_seqlen_q = seqlen_q; // Unused if cu_seqlens_q is empty params.Max_seqlen_k = seqlen_k; // Unused if cu_seqlens_k is empty params.dropout_p = p_dropout; params.philox_seed_ptr = mk_aoscalartensor(philox_seed); params.philox_offset1 = mk_aoscalartensor(philox_offset); params.philox_offset2 = 0; - params.causal_type = CausalType::WindowedAttention; - params.varlen_type = VarlenType::None; + params.causal_type = is_causal ? 
CausalType::WindowedAttention : CausalType::None; params.window_left = window_left; params.window_right = window_right; + params.varlen_type = VarlenType::None; +#if AOTRITON_ALWAYS_V3_API + using sdp::aotriton_adapter::mklazy_empty_like; + using sdp::aotriton_adapter::mklazy_fp32zeros; + using sdp::aotriton_adapter::LazyTensorContext; + LazyTensorContext lazy_delta { .like_tensor = softmax_lse_cont, .tensor_name = "delta" }; + LazyTensorContext lazy_dq_acc { .like_tensor = dq_t, .tensor_name = "dq_acc" }; + params.D = mklazy_empty_like<2>(&lazy_delta); + params.DQ_ACC = mklazy_fp32zeros<4>(&lazy_dq_acc); +#else + at::Tensor delta = at::empty_like(softmax_lse_cont).contiguous(); + params.D = mk_aotensor<2>(delta, "delta"); +#endif err = aotriton::v3::flash::attn_bwd(params, aotriton::v3::flash::attn_bwd_params::kVersion, stream); @@ -843,7 +842,6 @@ mha_varlen_bwd_aot(const at::Tensor &dout, // total_q x num_heads, x head_size CHECK_SHAPE(cu_seqlens_k, batch_size + 1); at::Tensor softmax_lse_cont = softmax_lse.view({batch_size * num_heads, max_seqlen_q}).contiguous(); - at::Tensor delta = at::empty_like(softmax_lse_cont).contiguous(); at::Tensor q_padded, k_padded, v_padded; q_padded = q.unsqueeze(0).transpose(1, 2); @@ -901,10 +899,10 @@ mha_varlen_bwd_aot(const at::Tensor &dout, // total_q x num_heads, x head_size max_seqlen_q, max_seqlen_k, is_causal); -#if V3_API +#if AOTRITON_V3_API const bool uses_swa = needs_swa; #else - // When V3_API = 0, uses_swa is constexpr and the if (uses_swa) branch can be + // When AOTRITON_V3_API = 0, uses_swa is constexpr and the if (uses_swa) branch can be // optimized out (hopefully). constexpr bool uses_swa = false; #endif @@ -924,8 +922,8 @@ mha_varlen_bwd_aot(const at::Tensor &dout, // total_q x num_heads, x head_size hipError_t err; // TODO: Error handling using sdp::aotriton_adapter::mk_aotensor; using sdp::aotriton_adapter::mk_aoscalartensor; - if (uses_swa) { -#if V3_API + if (uses_swa || AOTRITON_ALWAYS_V3_API) { +#if AOTRITON_V3_API using aotriton::v3::flash::CausalType; using aotriton::v3::flash::VarlenType; aotriton::v3::flash::attn_bwd_params params; @@ -935,11 +933,10 @@ mha_varlen_bwd_aot(const at::Tensor &dout, // total_q x num_heads, x head_size params.Sm_scale = softmax_scale; params.Out = mk_aotensor(out_t, "out"); params.DO = mk_aotensor(dout_t, "dout"); - params.DK = mk_aotensor(dq_padded, "dq"); - params.DV = mk_aotensor(dk_padded, "dk"); - params.DQ = mk_aotensor(dv_padded, "dv"); + params.DK = mk_aotensor(dk_padded, "dk"); + params.DV = mk_aotensor(dv_padded, "dv"); + params.DQ = mk_aotensor(dq_padded, "dq"); params.L = mk_aotensor<2>(softmax_lse_cont, "L"); - params.D = mk_aotensor<2>(delta, "delta"); params.cu_seqlens_q = mk_aotensor<1>(cu_seqlens_q, "cu_seqlens_q"); params.cu_seqlens_k = mk_aotensor<1>(cu_seqlens_k, "cu_seqlens_k"); params.Max_seqlen_q = max_seqlen_q; // Unused if cu_seqlens_q is empty @@ -948,17 +945,30 @@ mha_varlen_bwd_aot(const at::Tensor &dout, // total_q x num_heads, x head_size params.philox_seed_ptr = mk_aoscalartensor(philox_seed); params.philox_offset1 = mk_aoscalartensor(philox_offset); params.philox_offset2 = 0; - params.causal_type = CausalType::WindowedAttention; + params.causal_type = is_causal ? 
CausalType::WindowedAttention : CausalType::None; params.varlen_type = VarlenType::CompactVarlen; params.window_left = window_left; params.window_right = window_right; +#if AOTRITON_ALWAYS_V3_API + using sdp::aotriton_adapter::mklazy_empty_like; + using sdp::aotriton_adapter::mklazy_fp32zeros; + using sdp::aotriton_adapter::LazyTensorContext; + LazyTensorContext lazy_delta { .like_tensor = softmax_lse_cont, .tensor_name = "delta" }; + LazyTensorContext lazy_dq_acc { .like_tensor = dq_padded, .tensor_name = "dq_acc" }; + params.D = mklazy_empty_like<2>(&lazy_delta); + params.DQ_ACC = mklazy_fp32zeros<4>(&lazy_dq_acc); +#else + at::Tensor delta = at::empty_like(softmax_lse_cont).contiguous(); + params.D = mk_aotensor<2>(delta, "delta"); +#endif err = aotriton::v3::flash::attn_bwd(params, aotriton::v3::flash::attn_bwd_params::kVersion, stream); -#endif +#endif // AOTRITON_ALWAYS_V3_API } else { using aotriton::v2::flash::attn_bwd_compact_varlen; using sdp::aotriton_adapter::cast_dtype; + at::Tensor delta = at::empty_like(softmax_lse_cont).contiguous(); aotriton::TensorView<4> empty_bias(0, {0,0,0,0}, {0,0,0,0}, cast_dtype(q.dtype())); err = attn_bwd_compact_varlen(mk_aotensor(q_padded, "q"), mk_aotensor(k_padded, "k"), diff --git a/aten/src/ATen/native/transformers/hip/gemm_kernel_utils.h b/aten/src/ATen/native/transformers/hip/gemm_kernel_utils.h new file mode 100644 index 000000000000..c18744afc1ff --- /dev/null +++ b/aten/src/ATen/native/transformers/hip/gemm_kernel_utils.h @@ -0,0 +1,32 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +// This file is a trimmed version of cuda/mem_eff_attention/gemm_kernel_utils.h +#pragma once + +#define CHECK_NOSPARSE_CONTIGUOUS_CUDA(TENSOR) \ + TORCH_CHECK(TENSOR.is_cuda(), #TENSOR " must be a CUDA tensor"); \ + TORCH_CHECK(!TENSOR.is_sparse(), #TENSOR " must be a dense tensor"); \ + TORCH_CHECK(TENSOR.is_contiguous()); + +#define CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(TENSOR) \ + TORCH_CHECK(TENSOR.is_cuda(), #TENSOR " must be a CUDA tensor"); \ + TORCH_CHECK(!TENSOR.is_sparse(), #TENSOR " must be a dense tensor"); \ + TORCH_CHECK( \ + TENSOR.stride(-1) == 1, #TENSOR ": last dimension must be contiguous"); + +#define CHECK_ALIGNED_PTR(PTR, ALIGNMENT) \ + TORCH_CHECK( \ + uint64_t(PTR) % ALIGNMENT == 0, #PTR " is not correctly aligned") + +#define ASSIGN_CHECK_OVERFLOW(A, B) \ + { \ + A = B; \ + TORCH_CHECK( \ + B < std::numeric_limits::max(), #B " overflows"); \ + } diff --git a/aten/src/ATen/templates/FunctionalInverses.h b/aten/src/ATen/templates/FunctionalInverses.h index 3217e097d7ad..b15cd09a6c65 100644 --- a/aten/src/ATen/templates/FunctionalInverses.h +++ b/aten/src/ATen/templates/FunctionalInverses.h @@ -2,22 +2,12 @@ // ${generated_comment} +#include #include namespace at { namespace functionalization { -enum class InverseReturnMode { - /// Specifies that functional inverses should always return a view. - AlwaysView, - /// Specifies that functional inverses should always return a non-view / copy. - NeverView, - /// Specifies that functional inverses should return a view unless a (copying) scatter - /// inverse exists, in which case that will be used instead. - /// This avoids as_strided() calls that can be difficult for subclasses to handle. 
- ViewOrScatterInverse, -}; - struct FunctionalInverses { ${view_inverse_declarations} diff --git a/aten/src/ATen/templates/RegisterFunctionalization.cpp b/aten/src/ATen/templates/RegisterFunctionalization.cpp index dc8619c25fc5..408aff0cdab4 100644 --- a/aten/src/ATen/templates/RegisterFunctionalization.cpp +++ b/aten/src/ATen/templates/RegisterFunctionalization.cpp @@ -4,7 +4,7 @@ #include #include #include -#include +#include #include #include diff --git a/aten/src/ATen/templates/ViewMetaClasses.cpp b/aten/src/ATen/templates/ViewMetaClasses.cpp new file mode 100644 index 000000000000..0fd53171935f --- /dev/null +++ b/aten/src/ATen/templates/ViewMetaClasses.cpp @@ -0,0 +1,19 @@ +// ${generated_comment} + +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +${op_headers} +#endif + +namespace at { +namespace functionalization { + +${view_meta_implementations} + +} // namespace functionalization +} // namespace at diff --git a/aten/src/ATen/templates/ViewMetaClasses.h b/aten/src/ATen/templates/ViewMetaClasses.h new file mode 100644 index 000000000000..be2dee2a871b --- /dev/null +++ b/aten/src/ATen/templates/ViewMetaClasses.h @@ -0,0 +1,12 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +// ${generated_comment} + +#include + +namespace at { +namespace functionalization { + +${view_meta_declarations} + +} // namespace functionalization +} // namespace at diff --git a/aten/src/ATen/templates/ViewMetaClassesPythonBinding.cpp b/aten/src/ATen/templates/ViewMetaClassesPythonBinding.cpp new file mode 100644 index 000000000000..c784e5abe5c8 --- /dev/null +++ b/aten/src/ATen/templates/ViewMetaClassesPythonBinding.cpp @@ -0,0 +1,11 @@ +#include +#include + +namespace torch::functionalization { + +void initGenerated(PyObject* module) { + auto functionalization = py::handle(module).cast(); + $view_meta_bindings +} + +} // namespace torch::functionalization diff --git a/aten/src/ATen/test/cuda_allocator_test.cpp b/aten/src/ATen/test/cuda_allocator_test.cpp index 5aa2378c22c4..27a352e7d5a2 100644 --- a/aten/src/ATen/test/cuda_allocator_test.cpp +++ b/aten/src/ATen/test/cuda_allocator_test.cpp @@ -5,51 +5,6 @@ #include -#include - TEST(AllocatorTestCUDA, test_clone) { test_allocator_clone(c10::cuda::CUDACachingAllocator::get()); } - -static int called_dummy_free_0 = 0; -static int called_dummy_free_1 = 0; - -void* dummy_alloc_0(size_t size, int device, void* stream) {return nullptr;} -void dummy_free_0(void* data, size_t size, int device, void* stream) { - called_dummy_free_0++; -} -void dummy_free_1(void* data, size_t size, int device, void* stream) { - called_dummy_free_1++; -} - -// Tests that data_ptrs have their respective deleters -// when mixing allocators -TEST(AllocatorTestCUDA, test_pluggable_allocator_deleters) { - // Create a tensor with dummy_allocator_0, where dummy_free_0 is the deleter - auto dummy_allocator_0 = torch::cuda::CUDAPluggableAllocator::createCustomAllocator(dummy_alloc_0, dummy_free_0); - c10::cuda::CUDACachingAllocator::allocator.store(dummy_allocator_0.get()); - at::Tensor a = at::empty({0}, at::TensorOptions().device(at::kCUDA)); - - // Create a tensor with dummy_allocator_1, where dummy_free_1 is the deleter - auto dummy_allocator_1 = torch::cuda::CUDAPluggableAllocator::createCustomAllocator(dummy_alloc_0, dummy_free_1); - c10::cuda::CUDACachingAllocator::allocator.store(dummy_allocator_1.get()); - at::Tensor b = at::empty({0}, at::TensorOptions().device(at::kCUDA)); - - // Manually use a's deleter - auto* ctx = 
a.storage().data_ptr().get_context(); - a.storage().data_ptr().get_deleter()(ctx); - a.storage().mutable_data_ptr().release_context(); - - // a's deleter is dummy_free_0 - // dummy_free_0 should be called above, so called_dummy_free_0 should be 1 - ASSERT_TRUE(called_dummy_free_0 == 1); - - // Manually use b's deleter - ctx = b.storage().data_ptr().get_context(); - b.storage().data_ptr().get_deleter()(ctx); - b.storage().mutable_data_ptr().release_context(); - - // b's deleter is dummy_free_1 - // dummy_free_1 should be called above, so called_dummy_free_1 should be 1 - ASSERT_TRUE(called_dummy_free_1 == 1); -} diff --git a/aten/src/ATen/test/cuda_vectorized_test.cu b/aten/src/ATen/test/cuda_vectorized_test.cu index 7ba7bcb99bce..e4c18102526a 100644 --- a/aten/src/ATen/test/cuda_vectorized_test.cu +++ b/aten/src/ATen/test/cuda_vectorized_test.cu @@ -10,8 +10,13 @@ using namespace at::native::memory; constexpr int buffer_size = 1024; +#if defined(CUDA_VERSION) && CUDA_VERSION < 13000 __managed__ double4 buffer1[buffer_size]; __managed__ double4 buffer2[buffer_size]; +#else +__managed__ double4_16a buffer1[buffer_size]; +__managed__ double4_16a buffer2[buffer_size]; +#endif void reset_buffers() { for (int i = 0; i < buffer_size; i++) { diff --git a/benchmarks/dynamo/check_accuracy.py b/benchmarks/dynamo/check_accuracy.py index 5cd714fe02e9..678cee5f752c 100644 --- a/benchmarks/dynamo/check_accuracy.py +++ b/benchmarks/dynamo/check_accuracy.py @@ -15,6 +15,8 @@ "timm_efficientnet", # see https://github.com/pytorch/pytorch/issues/148699 "XGLMForCausalLM", # discovered in https://github.com/pytorch/pytorch/pull/128148 "moondream", # discovered in https://github.com/pytorch/pytorch/pull/159291 + # discovered in https://github.com/pytorch/pytorch/issues/161419. 
Its not flaky but really hard to repro, so skipping it + "mobilenetv3_large_100", } diff --git a/benchmarks/dynamo/ci_expected_accuracy/aot_eager_timm_training.csv b/benchmarks/dynamo/ci_expected_accuracy/aot_eager_timm_training.csv index 1dceba2f8ba9..1def1d99bd53 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/aot_eager_timm_training.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/aot_eager_timm_training.csv @@ -130,7 +130,7 @@ mnasnet_100,pass,7 -mobilenetv2_100,fail_accuracy,7 +mobilenetv2_100,pass,7 diff --git a/benchmarks/dynamo/ci_expected_accuracy/aot_eager_torchbench_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/aot_eager_torchbench_inference.csv index 01762c5f5f29..1d199fe8ea66 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/aot_eager_torchbench_inference.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/aot_eager_torchbench_inference.csv @@ -106,7 +106,7 @@ dlrm,pass,0 -doctr_det_predictor,pass,4 +doctr_det_predictor,pass,3 diff --git a/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_amp_freezing_torchbench_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_amp_freezing_torchbench_inference.csv index e68aa2fa5351..a4dbaeb7b546 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_amp_freezing_torchbench_inference.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_amp_freezing_torchbench_inference.csv @@ -98,11 +98,11 @@ dlrm,pass,0 -doctr_det_predictor,pass,5 +doctr_det_predictor,pass,3 -doctr_reco_predictor,pass,4 +doctr_reco_predictor,pass,1 diff --git a/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_freezing_torchbench_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_freezing_torchbench_inference.csv index aec659fdcd65..885029ba8c56 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_freezing_torchbench_inference.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_freezing_torchbench_inference.csv @@ -98,11 +98,11 @@ dlrm,pass,0 -doctr_det_predictor,pass,5 +doctr_det_predictor,pass,3 -doctr_reco_predictor,pass,4 +doctr_reco_predictor,pass,1 diff --git a/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_torchbench_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_torchbench_inference.csv index 4f2eec149352..aa7a3161afcc 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_torchbench_inference.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_torchbench_inference.csv @@ -98,11 +98,11 @@ dlrm,pass,0 -doctr_det_predictor,pass,5 +doctr_det_predictor,pass,3 -doctr_reco_predictor,pass,4 +doctr_reco_predictor,pass,1 diff --git a/benchmarks/dynamo/ci_expected_accuracy/dynamic_aot_eager_torchbench_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/dynamic_aot_eager_torchbench_inference.csv index 3e4c3caa1ca9..20cad351b127 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/dynamic_aot_eager_torchbench_inference.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/dynamic_aot_eager_torchbench_inference.csv @@ -106,7 +106,7 @@ dlrm,pass,0 -doctr_det_predictor,pass,4 +doctr_det_predictor,pass,3 diff --git a/benchmarks/dynamo/ci_expected_accuracy/dynamic_aot_eager_torchbench_training.csv b/benchmarks/dynamo/ci_expected_accuracy/dynamic_aot_eager_torchbench_training.csv index 3630f9a75af8..5050b3762ed9 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/dynamic_aot_eager_torchbench_training.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/dynamic_aot_eager_torchbench_training.csv @@ -2,7 +2,7 @@ name,accuracy,graph_breaks 
-torchrec_dlrm,fail_to_run,3 +torchrec_dlrm,pass,6 @@ -94,7 +94,7 @@ hf_Bert_large,pass,6 -hf_BigBird,fail_to_run,3 +hf_BigBird,pass,6 @@ -110,7 +110,7 @@ hf_GPT2_large,pass_due_to_skip,0 -hf_Reformer,fail_to_run,21 +hf_Reformer,pass,25 diff --git a/benchmarks/dynamo/ci_expected_accuracy/dynamic_cpu_inductor_torchbench_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/dynamic_cpu_inductor_torchbench_inference.csv index c8db4d582320..f26dea6f692e 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/dynamic_cpu_inductor_torchbench_inference.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/dynamic_cpu_inductor_torchbench_inference.csv @@ -82,11 +82,11 @@ dlrm,pass,0 -doctr_det_predictor,pass,5 +doctr_det_predictor,pass,3 -doctr_reco_predictor,pass,4 +doctr_reco_predictor,pass,1 diff --git a/benchmarks/dynamo/ci_expected_accuracy/dynamic_cpu_max_autotune_inductor_amp_freezing_torchbench_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/dynamic_cpu_max_autotune_inductor_amp_freezing_torchbench_inference.csv index f4c9ffddd997..39149853947c 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/dynamic_cpu_max_autotune_inductor_amp_freezing_torchbench_inference.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/dynamic_cpu_max_autotune_inductor_amp_freezing_torchbench_inference.csv @@ -98,11 +98,11 @@ dlrm,pass,0 -doctr_det_predictor,pass,5 +doctr_det_predictor,pass,3 -doctr_reco_predictor,pass,4 +doctr_reco_predictor,pass,1 diff --git a/benchmarks/dynamo/ci_expected_accuracy/dynamic_inductor_torchbench_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/dynamic_inductor_torchbench_inference.csv index 63d0efa38f63..2b2c1a504647 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/dynamic_inductor_torchbench_inference.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/dynamic_inductor_torchbench_inference.csv @@ -106,7 +106,7 @@ dlrm,pass,0 -doctr_det_predictor,pass,4 +doctr_det_predictor,pass,3 diff --git a/benchmarks/dynamo/ci_expected_accuracy/dynamo_eager_torchbench_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/dynamo_eager_torchbench_inference.csv index 01762c5f5f29..1d199fe8ea66 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/dynamo_eager_torchbench_inference.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/dynamo_eager_torchbench_inference.csv @@ -106,7 +106,7 @@ dlrm,pass,0 -doctr_det_predictor,pass,4 +doctr_det_predictor,pass,3 diff --git a/benchmarks/dynamo/ci_expected_accuracy/inductor_torchbench_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/inductor_torchbench_inference.csv index fbd169539ab7..e41018657c0e 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/inductor_torchbench_inference.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/inductor_torchbench_inference.csv @@ -106,7 +106,7 @@ dlrm,pass,0 -doctr_det_predictor,pass,4 +doctr_det_predictor,pass,3 diff --git a/benchmarks/dynamo/ci_expected_accuracy/rocm/aot_eager_torchbench_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/rocm/aot_eager_torchbench_inference.csv index 6f316b219bb9..bf70642a855e 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/rocm/aot_eager_torchbench_inference.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/rocm/aot_eager_torchbench_inference.csv @@ -106,11 +106,11 @@ dlrm,pass,0 -doctr_det_predictor,eager_fail_to_run,5 +doctr_det_predictor,eager_fail_to_run,3 -doctr_reco_predictor,eager_fail_to_run,4 +doctr_reco_predictor,eager_fail_to_run,1 diff --git a/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_aot_eager_torchbench_inference.csv 
b/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_aot_eager_torchbench_inference.csv index 4b5138ce9c36..e019365ccbfd 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_aot_eager_torchbench_inference.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_aot_eager_torchbench_inference.csv @@ -106,11 +106,11 @@ dlrm,pass,0 -doctr_det_predictor,eager_fail_to_run,5 +doctr_det_predictor,eager_fail_to_run,3 -doctr_reco_predictor,eager_fail_to_run,4 +doctr_reco_predictor,eager_fail_to_run,1 diff --git a/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_inductor_torchbench_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_inductor_torchbench_inference.csv index a3fc7cf19237..fed8ebded682 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_inductor_torchbench_inference.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_inductor_torchbench_inference.csv @@ -106,11 +106,11 @@ dlrm,pass,0 -doctr_det_predictor,eager_fail_to_run,5 +doctr_det_predictor,eager_fail_to_run,3 -doctr_reco_predictor,eager_fail_to_run,4 +doctr_reco_predictor,eager_fail_to_run,1 diff --git a/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamo_eager_torchbench_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamo_eager_torchbench_inference.csv index 6f316b219bb9..bf70642a855e 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamo_eager_torchbench_inference.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamo_eager_torchbench_inference.csv @@ -106,11 +106,11 @@ dlrm,pass,0 -doctr_det_predictor,eager_fail_to_run,5 +doctr_det_predictor,eager_fail_to_run,3 -doctr_reco_predictor,eager_fail_to_run,4 +doctr_reco_predictor,eager_fail_to_run,1 diff --git a/benchmarks/dynamo/ci_expected_accuracy/rocm/inductor_torchbench_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/rocm/inductor_torchbench_inference.csv index 8ccf95da9659..014e23e41cb3 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/rocm/inductor_torchbench_inference.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/rocm/inductor_torchbench_inference.csv @@ -106,11 +106,11 @@ dlrm,pass,0 -doctr_det_predictor,eager_fail_to_run,5 +doctr_det_predictor,eager_fail_to_run,3 -doctr_reco_predictor,eager_fail_to_run,4 +doctr_reco_predictor,eager_fail_to_run,1 diff --git a/benchmarks/dynamo/torchbench.yaml b/benchmarks/dynamo/torchbench.yaml index bf0a1b6c31e8..6a15cf33222b 100644 --- a/benchmarks/dynamo/torchbench.yaml +++ b/benchmarks/dynamo/torchbench.yaml @@ -219,7 +219,9 @@ skip: - timm_regnet - timm_nfnet - cuda: [] + cuda: + # Temporary until https://github.com/pytorch/pytorch/issues/162282 is fixed + - sam_fast test: training: diff --git a/benchmarks/operator_benchmark/benchmark_core.py b/benchmarks/operator_benchmark/benchmark_core.py index cb836bb5eaa4..3f79ed2318c4 100644 --- a/benchmarks/operator_benchmark/benchmark_core.py +++ b/benchmarks/operator_benchmark/benchmark_core.py @@ -4,6 +4,7 @@ import functools import json import os +import platform import timeit from collections import namedtuple from dataclasses import asdict, dataclass @@ -17,6 +18,7 @@ # needs to be imported after torch import torch.utils.cpp_extension as cpp_extension # noqa: F401 +from torch.utils.benchmark import Timer """Performance microbenchmarks. 
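As a point of reference for the newly imported torch.utils.benchmark.Timer: a minimal, self-contained sketch of the adaptive_autorange call pattern the harness adopts below (run_op is a hypothetical stand-in for the harness's forward callable; the real code wires in test_case.run_forward and scales the median by the iteration count):

import torch
from torch.utils.benchmark import Timer

def run_op(iters: int) -> None:
    # Hypothetical workload standing in for test_case.run_forward.
    x = torch.rand(64, 64)
    for _ in range(iters):
        x = x + x

timer = Timer(
    stmt="run_op(iters)",
    globals={"run_op": run_op, "iters": 10},
)
# adaptive_autorange reruns the statement until the median is statistically
# stable and returns a Measurement; .median is seconds per statement run.
measurement = timer.adaptive_autorange(min_run_time=0.0001)
print(f"median seconds per run_op(10) call: {measurement.median:.6f}")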
@@ -191,6 +193,11 @@ def __init__(self, args): self.predefined_minimum_secs = 1 self.max_iters = 1e6 self.use_jit = args.use_jit + self.use_compile = args.use_compile + if self.use_jit and self.use_compile: + raise ValueError( + "use_jit and use_compile are mutually exclusive, please specify one." + ) self.num_runs = args.num_runs self.print_per_iter = False self.output_csv = args.output_csv @@ -222,7 +229,7 @@ def _print_header(self): if self.args.operators: print(f"# {self.args.operators}") - def _print_perf_result(self, reported_run_time_us, test_case): + def _print_perf_result(self, results, test_case): if self.args.report_aibench: # Output for AIBench # Print out per iteration execution time instead of avg time @@ -236,12 +243,14 @@ def _print_perf_result(self, reported_run_time_us, test_case): "type": test_name, "metric": "latency", "unit": "us", - "value": str(reported_run_time_us[run]), + "value": str(results["reported_run_time_us"][run]), } ) ) else: - print(f"# Mode: {'JIT' if self.use_jit else 'Eager'}") + print( + f"# Mode: {'JIT' if self.use_jit else 'Compile' if self.use_compile else 'Eager'}" + ) print( f"# Name: {test_case.test_config.test_name}\n# Input: {test_case.test_config.input_config}" ) @@ -250,25 +259,33 @@ def _print_perf_result(self, reported_run_time_us, test_case): if self.num_runs > 1: for run in range(self.num_runs): print( - f"Run: {run}, {mode} Execution Time (us) : {reported_run_time_us[run]:.3f}" + f"Run: {run}, {mode} Execution Time (us) : {results['reported_run_time_us'][run]:.3f}" ) print() else: - print(f"{mode} Execution Time (us) : {reported_run_time_us[0]:.3f}\n") + print( + f"{mode} Execution Time (us) : {results['reported_run_time_us'][0]:.3f}" + ) + print(f"Peak Memory (KB) : {results['peak_memory']}\n") - def _perf_result_to_dict(self, reported_run_time_us, test_case): + def _perf_result_to_dict(self, results, test_case): """This function is the parallel of _print_perf_result, which instead of writing information to terminal, returns a dictionary.
""" if self.args.report_aibench: return {} + out = { "test_name": test_case.test_config.test_name, "input_config": test_case.test_config.input_config, - "mode": "JIT" if self.use_jit else "Eager", + "runtime": ( + "JIT" if self.use_jit else "Compile" if self.use_compile else "Eager" + ), "run": "Backward" if test_case.test_config.run_backward else "Forward", - "latency": round(reported_run_time_us[0], 3), + "latency": round(results["reported_run_time_us"][0], 3), "latency unit": "us", + "peak memory": results["peak_memory"], + "memory unit": "KB", } # parsing test_case.test_config.input_config, adding it as entries to the 'out' dictionary @@ -330,10 +347,26 @@ def _launch_forward(self, test_case, iters, print_per_iter): func = test_case.run_forward if self.use_jit: func = test_case.run_jit_forward - forward_time = timeit.timeit( - functools.partial(func, iters, print_per_iter, cuda_sync), number=1 + if self.use_compile: + func = test_case.run_compile_forward + + if not cuda_sync: + forward_time = timeit.timeit( + functools.partial(func, iters, print_per_iter, cuda_sync), number=1 + ) + return forward_time + # Stable timing with Timer + timer = Timer( + stmt="func(iters, print_per_iter, cuda_sync)", + globals={ + "func": func, + "iters": iters, + "print_per_iter": print_per_iter, + "cuda_sync": cuda_sync, + }, ) - return forward_time + result = timer.adaptive_autorange(min_run_time=0.0001) + return result.median * iters def _launch_backward(self, test_case, iters, print_per_iter=False): """This function runs forward path of an op to get an output. Then the backward path is executed @@ -346,7 +379,7 @@ def _launch_backward(self, test_case, iters, print_per_iter=False): ) return backward_time - def _measure_time(self, launch_test, test_case, iters, print_per_iter): + def _measure_metrics(self, launch_test, test_case, iters, print_per_iter): """ This function execute the operator for iterations then look at the time. If it's not significant, the number of iterations will be increased before rerun. @@ -354,8 +387,25 @@ def _measure_time(self, launch_test, test_case, iters, print_per_iter): """ curr_test_total_time = 0 time_trace = [] + peak_memory = 0 + input_values = test_case.op_bench.inputs.values() + device, device_module = None, None + if input_values and isinstance(next(iter(input_values)), torch.Tensor): + # The device and device module information are crucial for memory metric calculation, + # In case of ops where inputs are integers (not tensor), memory metrics need not be calculated. 
+ sample_input = next(iter(input_values)) + device = sample_input.device + device_module = torch.get_device_module(device.type) + # TODO: add support for cpu memory measurement while True: + if hasattr(device_module, "reset_peak_memory_stats"): + device_module.reset_peak_memory_stats(device) run_time_sec = launch_test(test_case, iters, print_per_iter) + if hasattr(device_module, "synchronize"): + device_module.synchronize(device) + # Memory measurement process + if hasattr(device_module, "max_memory_allocated"): + peak_memory = device_module.max_memory_allocated(device) curr_test_total_time += run_time_sec # Analyze time after each run to decide if the result is stable results_are_significant = self._iteration_result_is_significant( @@ -369,7 +419,13 @@ def _measure_time(self, launch_test, test_case, iters, print_per_iter): time_trace.append(report_run_time) # Print out the time spent in each epoch in ms if self.args.report_aibench: - mode = "JIT" if self.use_jit else "Eager" + mode = ( + "JIT" + if self.use_jit + else "Compile" + if self.use_compile + else "Eager" + ) test_name = "_".join( [test_case.framework, test_case.test_config.test_name, mode] ) @@ -381,7 +437,7 @@ def _measure_time(self, launch_test, test_case, iters, print_per_iter): "metric": "latency", "unit": "ms", "value": str(report_run_time / 1e3), - } + }, ) ) if results_are_significant: @@ -391,7 +447,7 @@ def _measure_time(self, launch_test, test_case, iters, print_per_iter): # iteration count, and run the benchmark again... iters = self._predict_num_iter_needed(iters) reported_run_time_us = np.percentile(np.array(time_trace), 50) - return reported_run_time_us + return reported_run_time_us, peak_memory / 1024 def _check_keep(self, test_flag, cmd_flag): return cmd_flag is None or test_flag == cmd_flag @@ -478,6 +534,7 @@ def _output_json( self, perf_list, output_file, + benchmark_name="PyTorch operator benchmark", ): """ Write the result into JSON format, so that it can be uploaded to the benchmark database @@ -495,8 +552,10 @@ def _output_json( input_config = perf_item.get("input_config", "") run_type = perf_item.get("run") latency = perf_item.get("latency", 0) - - dtype = "float32" # default + peak_memory = perf_item.get("peak memory", 0) + device = perf_item.get("device", "unknown") + dtype = perf_item.get("dtype", "torch.float").split(".")[1] + runtime = perf_item.get("runtime", None) # Extract mode based on run_type mode = None @@ -505,6 +564,22 @@ def _output_json( elif run_type == "Backward": mode = "training" + # Extract use_compile from it + if runtime == "Compile": + use_compile = True + elif runtime == "Eager": + use_compile = False + else: + use_compile = None + + device_arch = ( + torch.cuda.get_device_name(0) + if device == "cuda" + else platform.processor() + if device == "cpu" + else "unknown" + ) + # Create the record @dataclass class BenchmarkInfo: @@ -532,12 +607,18 @@ class BenchmarkRecord: model: ModelInfo metric: MetricInfo - record = BenchmarkRecord( + # Add record for latency + record_latency = BenchmarkRecord( benchmark=BenchmarkInfo( - name="PyTorch operator benchmark", + name=benchmark_name, mode=mode, dtype=dtype, - extra_info={"input_config": input_config}, + extra_info={ + "input_config": input_config, + "device": device, + "arch": device_arch, + "use_compile": use_compile, + }, ), model=ModelInfo( name=test_name, type="micro-benchmark", origins=["pytorch"] @@ -549,8 +630,17 @@ class BenchmarkRecord: target_value=None, ), ) - - records.append(asdict(record)) + 
records.append(asdict(record_latency)) + + # Add record for peak memory + record_memory = copy.deepcopy(record_latency) + record_memory.metric = MetricInfo( + name="peak memory", + unit="KB", + benchmark_values=[peak_memory], + target_value=None, + ) + records.append(asdict(record_memory)) # Write all records to the output file with open(output_file, "w", encoding="utf-8") as f: @@ -566,6 +656,7 @@ def run(self): "tag", "run_backward", "Execution Time", + "Peak Memory (KB)", ] if self.args.output_json or self.args.output_json_for_dashboard: @@ -603,13 +694,16 @@ def run(self): test_case, self.args.warmup_iterations, print_per_iter=False ) # Actual Execution - reported_time = [ - self._measure_time( + results = [ + self._measure_metrics( launch_func, test_case, self.iters, self.print_per_iter ) for _ in range(self.num_runs) ] - self._print_perf_result(reported_time, test_case) + result_dict = dict() + result_dict["reported_run_time_us"] = [r[0] for r in results] + result_dict["peak_memory"] = results[0][1] + self._print_perf_result(results=result_dict, test_case=test_case) # output results to csv self._output_csv( @@ -625,16 +719,17 @@ def run(self): ), test_case.test_config.tag, test_case.test_config.run_backward, - reported_time[0], + result_dict["reported_run_time_us"][0], + result_dict["peak_memory"], ], ) if self.args.output_json or self.args.output_json_for_dashboard: - perf_list.append( - self._perf_result_to_dict(reported_time, test_case) - ) + perf_list.append(self._perf_result_to_dict(result_dict, test_case)) if self.args.output_json_for_dashboard: - self._output_json(perf_list, self.args.output_json_for_dashboard) + self._output_json( + perf_list, self.args.output_json_for_dashboard, self.args.benchmark_name + ) if self.args.output_json: with open(self.args.output_json, "w") as f: diff --git a/benchmarks/operator_benchmark/benchmark_pytorch.py b/benchmarks/operator_benchmark/benchmark_pytorch.py index 52ae47047daa..cfed9ebac04b 100644 --- a/benchmarks/operator_benchmark/benchmark_pytorch.py +++ b/benchmarks/operator_benchmark/benchmark_pytorch.py @@ -4,6 +4,15 @@ import torch +# Import the C++ extension to register the _consume operator +try: + import benchmark_cpp_extension # noqa: F401 +except ImportError as err: + # If the extension isn't built, the script must raise an error + raise ImportError( + "Failed to import C++ extension, please build it using \ncd pt_extension \npython -m pip install ." + ) from err + """PyTorch performance microbenchmarks. 
This module contains PyTorch-specific functionalities for performance @@ -71,6 +80,16 @@ def forward_consume(self, iters: int): for _ in range(iters): torch.ops.operator_benchmark._consume(self.forward_impl()) + def forward_impl_eager(self): + # This is to supply the inputs to the forward function which + # will be called in both the eager and compile mode of local runs + return self.forward(*self.get_inputs()) + + def forward_consume_eager(self, iters: int): + # Eager version of forward_consume without decorators (compilation handled by torch.compile) + for _ in range(iters): + torch.ops.operator_benchmark._consume(self.forward_impl_eager()) + def module_name(self): """this is used to label the operator being benchmarked""" if self.user_given_name: @@ -117,18 +136,34 @@ def __init__(self, op_bench, test_config): self.framework = "PyTorch" self.time_series = [] self._jit_forward_graph = None + self._compile_forward_graph = None def _generate_jit_forward_graph(self): """generate a graph for the forward function via scripting""" scripted_op_bench = torch.jit.script(self.op_bench) return scripted_op_bench.forward_consume + def _generate_compile_forward_graph(self): + """generate a compiled graph for the forward function via torch.compile""" + compiled_forward_consume = torch.compile( + self.op_bench.forward_consume_eager, backend="inductor" + ) + return compiled_forward_consume + def run_jit_forward(self, num_runs, print_per_iter=False, cuda_sync=False): """Run the forward path of an op with JIT mode""" if self._jit_forward_graph is None: self._jit_forward_graph = self._generate_jit_forward_graph() self._jit_forward_graph(num_runs) + def run_compile_forward(self, num_runs, print_per_iter=False, cuda_sync=False): + """Run the forward path of an op with compile mode""" + if self._compile_forward_graph is None: + self._compile_forward_graph = self._generate_compile_forward_graph() + self._compile_forward_graph(num_runs) + if cuda_sync: + torch.cuda.synchronize(torch.cuda.current_device()) + def _print_per_iter(self): # print last 50 values length = min(len(self.time_series), 50) @@ -150,14 +185,14 @@ def run_forward(self, num_runs, print_per_iter, cuda_sync): if print_per_iter: for _ in range(num_runs): start_time = time.time() - self.output = self.op_bench.forward_impl() + self.output = self.op_bench.forward_impl_eager() if cuda_sync: torch.cuda.synchronize(torch.cuda.current_device()) end_time = time.time() self.time_series.append((end_time - start_time) * 1e3) else: for _ in range(num_runs): - self.output = self.op_bench.forward_impl() + self.output = self.op_bench.forward_impl_eager() if cuda_sync: torch.cuda.synchronize(torch.cuda.current_device()) diff --git a/benchmarks/operator_benchmark/benchmark_runner.py b/benchmarks/operator_benchmark/benchmark_runner.py index 9dfab781498e..6568cf9bf3ee 100644 --- a/benchmarks/operator_benchmark/benchmark_runner.py +++ b/benchmarks/operator_benchmark/benchmark_runner.py @@ -62,6 +62,13 @@ def parse_args(): default=None, ) + parser.add_argument( + "--benchmark-name", + "--benchmark_name", + help="Name of the benchmark to store results to", + default="PyTorch operator benchmark", + ) + parser.add_argument( "--list-tests", "--list_tests", @@ -135,6 +142,16 @@ def parse_args(): help="Run operators with PyTorch JIT mode", ) + parser.add_argument( + "--use-compile", + "--use_compile", + type=benchmark_utils.str2bool, + nargs="?", + const=True, + default=False, + help="Run operators with PyTorch Compile mode", + ) + parser.add_argument( "--forward-only", 
"--forward_only", @@ -162,7 +179,7 @@ def parse_args(): "--output-json-for-dashboard", "--output_json_for_dashboard", help="Save results in JSON format for display on the OSS dashboard", - default="False", + default="benchmark-results.json", ) args, _ = parser.parse_known_args() diff --git a/benchmarks/operator_benchmark/expected_ci_operator_benchmark_eager_float32_cpu.csv b/benchmarks/operator_benchmark/expected_ci_operator_benchmark_eager_float32_cpu.csv index 873f14d20127..9a7b6797e982 100644 --- a/benchmarks/operator_benchmark/expected_ci_operator_benchmark_eager_float32_cpu.csv +++ b/benchmarks/operator_benchmark/expected_ci_operator_benchmark_eager_float32_cpu.csv @@ -1,5 +1,5 @@ Benchmarking Framework,Benchmarking Module Name,Case Name,tag,run_backward,Execution Time -PyTorch,add,add_M1_N1_K1_cpu,short,FALSE,3.9497 +PyTorch,add,add_M1_N1_K1_cpu,short,FALSE,2.459 PyTorch,add,add_M64_N64_K64_cpu,short,FALSE,14.3181 PyTorch,add,add_M64_N64_K128_cpu,short,FALSE,14.6826 PyTorch,add,add_M1_N1_K1_cpu_bwdall_BACKWARD,short,TRUE,58.1449 @@ -376,10 +376,10 @@ PyTorch,relu6,"relu6_dims(3,4,5)_contigFalse_inplaceFalse_dtypetorch.qint32",sho PyTorch,relu6,"relu6_dims(2,3,4,5)_contigFalse_inplaceFalse_dtypetorch.quint8",short,FALSE,9.6588 PyTorch,relu6,"relu6_dims(2,3,4,5)_contigFalse_inplaceFalse_dtypetorch.qint8",short,FALSE,9.5969 PyTorch,relu6,"relu6_dims(2,3,4,5)_contigFalse_inplaceFalse_dtypetorch.qint32",short,FALSE,9.547 -PyTorch,relu6,"relu6_dims(512,512)_contigFalse_inplaceFalse_dtypetorch.quint8",short,FALSE,68.739 +PyTorch,relu6,"relu6_dims(512,512)_contigFalse_inplaceFalse_dtypetorch.quint8",short,FALSE,50.21375 PyTorch,relu6,"relu6_dims(512,512)_contigFalse_inplaceFalse_dtypetorch.qint8",short,FALSE,45.14133333 PyTorch,relu6,"relu6_dims(512,512)_contigFalse_inplaceFalse_dtypetorch.qint32",short,FALSE,52.6664 -PyTorch,relu6,"relu6_dims(256,1024)_contigFalse_inplaceFalse_dtypetorch.quint8",short,FALSE,69.1875 +PyTorch,relu6,"relu6_dims(256,1024)_contigFalse_inplaceFalse_dtypetorch.quint8",short,FALSE,51.49525 PyTorch,relu6,"relu6_dims(256,1024)_contigFalse_inplaceFalse_dtypetorch.qint8",short,FALSE,48.3458 PyTorch,relu6,"relu6_dims(256,1024)_contigFalse_inplaceFalse_dtypetorch.qint32",short,FALSE,62.0719 PyTorch,functional.hardtanh,"functional.hardtanh_dims(3,4,5)_contigFalse_inplaceFalse_dtypetorch.quint8",short,FALSE,7.5728 @@ -388,10 +388,10 @@ PyTorch,functional.hardtanh,"functional.hardtanh_dims(3,4,5)_contigFalse_inplace PyTorch,functional.hardtanh,"functional.hardtanh_dims(2,3,4,5)_contigFalse_inplaceFalse_dtypetorch.quint8",short,FALSE,8.1647 PyTorch,functional.hardtanh,"functional.hardtanh_dims(2,3,4,5)_contigFalse_inplaceFalse_dtypetorch.qint8",short,FALSE,8.1768 PyTorch,functional.hardtanh,"functional.hardtanh_dims(2,3,4,5)_contigFalse_inplaceFalse_dtypetorch.qint32",short,FALSE,8.0619 -PyTorch,functional.hardtanh,"functional.hardtanh_dims(512,512)_contigFalse_inplaceFalse_dtypetorch.quint8",short,FALSE,67.118 +PyTorch,functional.hardtanh,"functional.hardtanh_dims(512,512)_contigFalse_inplaceFalse_dtypetorch.quint8",short,FALSE,48.88475 PyTorch,functional.hardtanh,"functional.hardtanh_dims(512,512)_contigFalse_inplaceFalse_dtypetorch.qint8",short,FALSE,43.702 PyTorch,functional.hardtanh,"functional.hardtanh_dims(512,512)_contigFalse_inplaceFalse_dtypetorch.qint32",short,FALSE,50.3613 -PyTorch,functional.hardtanh,"functional.hardtanh_dims(256,1024)_contigFalse_inplaceFalse_dtypetorch.quint8",short,FALSE,67.436 
+PyTorch,functional.hardtanh,"functional.hardtanh_dims(256,1024)_contigFalse_inplaceFalse_dtypetorch.quint8",short,FALSE,50.3995 PyTorch,functional.hardtanh,"functional.hardtanh_dims(256,1024)_contigFalse_inplaceFalse_dtypetorch.qint8",short,FALSE,46.9813 PyTorch,functional.hardtanh,"functional.hardtanh_dims(256,1024)_contigFalse_inplaceFalse_dtypetorch.qint32",short,FALSE,59.2295 PyTorch,functional.hardsigmoid,"functional.hardsigmoid_dims(3,4,5)_contigFalse_inplaceFalse_dtypetorch.quint8",short,FALSE,6.5189 @@ -1316,4 +1316,4 @@ PyTorch,where,"where_cond_shape(8,16,1)_input_shape(1,)_other_shape(1,)_cpu_dtyp PyTorch,where,"where_cond_shape(8,16,1)_input_shape(16,1)_other_shape(8,16,1)_cpu_dtypetorch.float32",short,FALSE,5.763 PyTorch,where,"where_cond_shape(8,16,1)_input_shape(8,1,1)_other_shape(1,)_cpu_dtypetorch.float32",short,FALSE,5.744666667 PyTorch,clamp,clamp_M512_N512_cpu,short,FALSE,15.26233333 -PyTorch,gelu,gelu_M512_N512_cpu,short,FALSE,31.33166667 \ No newline at end of file +PyTorch,gelu,gelu_M512_N512_cpu,short,FALSE,31.33166667 diff --git a/benchmarks/operator_benchmark/pt/add_test.py b/benchmarks/operator_benchmark/pt/add_test.py index 54504c4f3005..739b8ef14a54 100644 --- a/benchmarks/operator_benchmark/pt/add_test.py +++ b/benchmarks/operator_benchmark/pt/add_test.py @@ -52,27 +52,6 @@ def forward(self, input_one, input_two): op_bench.generate_pt_test(add_long_configs + add_short_configs, AddBenchmark) op_bench.generate_pt_gradient_test(add_long_configs + add_short_configs, AddBenchmark) - -"""Mircobenchmark for addmm operator.""" - - -class AddmmBenchmark(op_bench.TorchBenchmarkBase): - def init(self, M, N, K, device): - self.inputs = { - "input_one": torch.rand(M, K, device=device, requires_grad=self.auto_set()), - "mat1": torch.rand(M, N, device=device, requires_grad=self.auto_set()), - "mat2": torch.rand(N, K, device=device, requires_grad=self.auto_set()), - } - self.set_module_name("addmm") - - def forward(self, input_one, mat1, mat2): - return torch.addmm(input_one, mat1, mat2) - - -op_bench.generate_pt_test(add_long_configs + add_short_configs, AddmmBenchmark) -op_bench.generate_pt_gradient_test(add_long_configs + add_short_configs, AddmmBenchmark) - - """Mircobenchmark for addr operator.""" @@ -106,46 +85,5 @@ def forward(self, input_one, vec1, vec2): op_bench.generate_pt_test(addr_configs, AddrBenchmark) op_bench.generate_pt_gradient_test(addr_configs, AddrBenchmark) - -"""Mircobenchmark for addbmm operator.""" - - -class AddbmmBenchmark(op_bench.TorchBenchmarkBase): - def init(self, B, M, N, K, device): - self.inputs = { - "input_one": torch.rand( - (M, N), device=device, requires_grad=self.auto_set() - ), - "batch1": torch.rand( - (B, M, K), device=device, requires_grad=self.auto_set() - ), - "batch2": torch.rand( - ( - B, - K, - N, - ), - device=device, - requires_grad=self.auto_set(), - ), - } - self.set_module_name("addbmm") - - def forward(self, input_one, batch1, batch2): - return torch.addbmm(input_one, batch1, batch2) - - -addbmm_configs = op_bench.cross_product_configs( - B=[2, 100], - M=[8, 256], - N=[256, 16], - K=[15, 16], - device=["cpu", "cuda"], - tags=["addbmm"], -) - -op_bench.generate_pt_test(addbmm_configs, AddbmmBenchmark) -op_bench.generate_pt_gradient_test(addbmm_configs, AddbmmBenchmark) - if __name__ == "__main__": op_bench.benchmark_runner.main() diff --git a/benchmarks/operator_benchmark/pt/addmm_test.py b/benchmarks/operator_benchmark/pt/addmm_test.py new file mode 100644 index 000000000000..a98628944b3e --- /dev/null +++ 
b/benchmarks/operator_benchmark/pt/addmm_test.py @@ -0,0 +1,115 @@ +import operator_benchmark as op_bench + +import torch + + +"""Microbenchmarks for add_(matmul) operator. Supports both Caffe2/PyTorch.""" + +# Configs for PT add operator +addmm_long_configs = op_bench.cross_product_configs( + M=[256, 1024, 3000], + N=[512, 4096], + K=[512, 4096], + device=["cuda"], + tags=["long"], + dtype=[torch.float16, torch.bfloat16, torch.float32], +) + + +addmm_short_configs = op_bench.config_list( + attr_names=["M", "N", "K"], + attrs=[ + [1, 1, 1], + [64, 64, 64], + [64, 64, 128], + ], + cross_product_configs={ + "device": ["cpu", "cuda"], + "dtype": [torch.float], + }, + tags=["short"], +) + + +"""Mircobenchmark for addmm operator.""" + + +class AddmmBenchmark(op_bench.TorchBenchmarkBase): + def init(self, M, N, K, device, dtype): + self.inputs = { + "input_one": torch.rand( + M, K, device=device, requires_grad=self.auto_set(), dtype=dtype + ), + "mat1": torch.rand( + M, N, device=device, requires_grad=self.auto_set(), dtype=dtype + ), + "mat2": torch.rand( + N, K, device=device, requires_grad=self.auto_set(), dtype=dtype + ), + } + self.set_module_name("addmm") + + def forward(self, input_one, mat1, mat2): + return torch.addmm(input_one, mat1, mat2) + + +op_bench.generate_pt_test(addmm_long_configs + addmm_long_configs, AddmmBenchmark) +op_bench.generate_pt_gradient_test( + addmm_long_configs + addmm_long_configs, AddmmBenchmark +) + +"""Mircobenchmark for addbmm operator.""" + + +class AddbmmBenchmark(op_bench.TorchBenchmarkBase): + def init(self, B, M, N, K, device, dtype): + self.inputs = { + "input_one": torch.rand( + (M, N), device=device, requires_grad=self.auto_set(), dtype=dtype + ), + "batch1": torch.rand( + (B, M, K), device=device, requires_grad=self.auto_set(), dtype=dtype + ), + "batch2": torch.rand( + ( + B, + K, + N, + ), + device=device, + requires_grad=self.auto_set(), + dtype=dtype, + ), + } + self.set_module_name("addbmm") + + def forward(self, input_one, batch1, batch2): + return torch.addbmm(input_one, batch1, batch2) + + +addbmm_long_configs = op_bench.cross_product_configs( + B=[8, 32], + M=[256, 1024], + N=[256, 1024], + K=[64, 128], + device=["cuda"], + dtype=[torch.float16, torch.bfloat16, torch.float32], + tags=["long"], +) +addbmm_short_configs = op_bench.cross_product_configs( + B=[1, 8], + M=[8, 128], + N=[32, 64], + K=[256, 512], + device=["cpu", "cuda"], + dtype=[torch.float16, torch.bfloat16, torch.float32], + tags=["short"], +) + +op_bench.generate_pt_test(addbmm_long_configs + addbmm_short_configs, AddbmmBenchmark) +op_bench.generate_pt_gradient_test( + addbmm_long_configs + addbmm_short_configs, AddbmmBenchmark +) + +if __name__ == "__main__": + op_bench.benchmark_runner.main() diff --git a/benchmarks/operator_benchmark/pt/bmm_test.py b/benchmarks/operator_benchmark/pt/bmm_test.py index 1c6d1f9aca55..f867f6ac09f8 100644 --- a/benchmarks/operator_benchmark/pt/bmm_test.py +++ b/benchmarks/operator_benchmark/pt/bmm_test.py @@ -27,12 +27,12 @@ ) batched_binary_configs_long = op_bench.cross_product_configs( - B=[1, 128], - M=[8, 128], - N=[32, 64], - K=[4, 256], - device=["cpu", "cuda"], - dtype=[torch.float, torch.bfloat16], + B=[8, 32], + M=[256, 1024], + N=[256, 1024], + K=[64, 128], + device=["cuda"], + dtype=[torch.float32, torch.bfloat16, torch.float16], tags=["long"], ) @@ -40,8 +40,12 @@ class BatchedBinaryOpBenchmark(op_bench.TorchBenchmarkBase): def init(self, B, M, N, K, device, dtype, op_func): self.inputs = { - "batch1": torch.rand((B, M, N), 
device=device).to(dtype=dtype), - "batch2": torch.rand((B, N, K), device=device).to(dtype=dtype), + "batch1": torch.rand( + (B, M, N), device=device, dtype=dtype, requires_grad=self.auto_set() + ), + "batch2": torch.rand( + (B, N, K), device=device, dtype=dtype, requires_grad=self.auto_set() + ), } self.op_func = op_func @@ -54,6 +58,11 @@ def forward(self, batch1, batch2): batched_binary_configs_short + batched_binary_configs_long, BatchedBinaryOpBenchmark, ) +op_bench.generate_pt_gradient_tests_from_op_list( + batched_binary_ops, + batched_binary_configs_long, + BatchedBinaryOpBenchmark, +) # batched ternary ops @@ -66,9 +75,15 @@ def forward(self, batch1, batch2): class BatchedTernaryOpBenchmark(op_bench.TorchBenchmarkBase): def init(self, B, M, N, K, device, dtype, op_func): self.inputs = { - "input_": torch.rand((B, M, K), device=device).to(dtype=dtype), - "batch1": torch.rand((B, M, N), device=device).to(dtype=dtype), - "batch2": torch.rand((B, N, K), device=device).to(dtype=dtype), + "input_": torch.rand( + (B, M, K), device=device, dtype=dtype, requires_grad=self.auto_set() + ), + "batch1": torch.rand( + (B, M, N), device=device, dtype=dtype, requires_grad=self.auto_set() + ), + "batch2": torch.rand( + (B, N, K), device=device, dtype=dtype, requires_grad=self.auto_set() + ), } self.op_func = op_func @@ -81,6 +96,12 @@ def forward(self, input_, batch1, batch2): batched_binary_configs_short + batched_binary_configs_long, BatchedTernaryOpBenchmark, ) +op_bench.generate_pt_gradient_tests_from_op_list( + batched_ternary_ops, + batched_binary_configs_long, + BatchedTernaryOpBenchmark, +) + # TODO: does it automatically register new scripts? diff --git a/benchmarks/operator_benchmark/pt/matmul_test.py b/benchmarks/operator_benchmark/pt/matmul_test.py index e92728e9ebd3..d0c58aa16e8f 100644 --- a/benchmarks/operator_benchmark/pt/matmul_test.py +++ b/benchmarks/operator_benchmark/pt/matmul_test.py @@ -13,33 +13,46 @@ [128, 128, 128, True, False], [256, 256, 256, False, True], ], - cross_product_configs={ - "device": ["cpu", "cuda"], - }, + cross_product_configs={"device": ["cpu", "cuda"]}, tags=["short"], ) mm_long_configs = op_bench.cross_product_configs( - M=[32], - N=[512, 128], - K=[64], + M=[256, 1024, 3000], + N=[512, 4096], + K=[512, 4096], trans_a=[False, True], trans_b=[True, False], - device=["cpu", "cuda"], + device=["cuda"], + dtype=[torch.float16, torch.bfloat16, torch.float32], tags=["long"], ) class MatMulBenchmark(op_bench.TorchBenchmarkBase): - def init(self, M, N, K, trans_a, trans_b, device): + def init(self, M, N, K, trans_a, trans_b, device, dtype=torch.float): + # Create tensors without requires_grad first, then set it separately + # This avoids creating graph leaves that cannot be deep copied + if trans_a: + input_one = torch.rand(M, N, device=device, dtype=dtype) + else: + input_one = torch.rand(N, M, device=device, dtype=dtype).t() + + if trans_b: + input_two = torch.rand(N, K, device=device, dtype=dtype) + else: + input_two = torch.rand(K, N, device=device, dtype=dtype).t() + + # Set requires_grad after tensor creation to avoid graph leaf issues + if self.auto_set(): + input_one.requires_grad_(True) + if self.auto_set(): + input_two.requires_grad_(True) + self.inputs = { - "input_one": torch.rand(M, N, device=device) - if trans_a - else torch.rand(N, M, device=device).t(), - "input_two": torch.rand(N, K, device=device) - if trans_b - else torch.rand(K, N, device=device).t(), + "input_one": input_one, + "input_two": input_two, } self.set_module_name("matmul") @@ 
-48,6 +61,7 @@ def forward(self, input_one, input_two): op_bench.generate_pt_test(mm_long_configs + mm_short_configs, MatMulBenchmark) +op_bench.generate_pt_gradient_test(mm_long_configs, MatMulBenchmark) if __name__ == "__main__": diff --git a/benchmarks/operator_benchmark/pt/mm_test.py b/benchmarks/operator_benchmark/pt/mm_test.py index bf2a2651e8fb..f9e0743ba712 100644 --- a/benchmarks/operator_benchmark/pt/mm_test.py +++ b/benchmarks/operator_benchmark/pt/mm_test.py @@ -23,11 +23,11 @@ ) mm_long_configs = op_bench.cross_product_configs( - M=[8, 128], - N=[32, 64], - K=[256, 512], - device=["cpu", "cuda"], - dtype=[torch.float, torch.bfloat16], + M=[256, 1024, 3000], + N=[512, 4096], + K=[512, 4096], + device=["cuda"], + dtype=[torch.float16, torch.bfloat16, torch.float32], tags=["long"], ) @@ -35,8 +35,12 @@ class MmOpBenchmark(op_bench.TorchBenchmarkBase): def init(self, M, N, K, device, dtype, op_func): self.inputs = { - "input_one": torch.randn(M, N, device=device).to(dtype=dtype), - "input_two": torch.randn(N, K, device=device).to(dtype=dtype), + "input_one": torch.randn( + M, N, device=device, requires_grad=self.auto_set(), dtype=dtype + ), + "input_two": torch.randn( + N, K, device=device, requires_grad=self.auto_set(), dtype=dtype + ), } self.op_func = op_func @@ -47,6 +51,9 @@ def forward(self, input_one, input_two): op_bench.generate_pt_tests_from_op_list( ops_list, mm_short_configs + mm_long_configs, MmOpBenchmark ) +op_bench.generate_pt_gradient_tests_from_op_list( + ops_list, mm_long_configs, MmOpBenchmark +) if __name__ == "__main__": diff --git a/buckbuild.bzl b/buckbuild.bzl index c5608f53ffea..193c16fbd4e5 100644 --- a/buckbuild.bzl +++ b/buckbuild.bzl @@ -11,7 +11,7 @@ load("//tools/build_defs:glob_defs.bzl", "subdir_glob") load("//tools/build_defs:platform_defs.bzl", "APPLETVOS", "IOS", "MACOSX") load("//tools/build_defs:type_defs.bzl", "is_list", "is_string") load("//tools/build_defs/android:build_mode_defs.bzl", is_production_build_android = "is_production_build") -load("//tools/build_defs/apple:build_mode_defs.bzl", is_production_build_ios = "is_production_build") +load("//tools/build_defs/apple:build_mode_defs.bzl", is_production_build_ios = "is_production_build", is_profile_build_ios = "is_profile_build") load( ":build_variables.bzl", "aten_cpu_source_list", @@ -74,7 +74,7 @@ def _is_build_mode_dev(): if is_production_build_android(): # Android Prod builds return False - if is_production_build_ios(): + if is_production_build_ios() or is_profile_build_ios(): # iOS Prod builds return False @@ -391,6 +391,8 @@ def get_aten_generated_files(enabled_backends): "CompositeExplicitAutogradFunctions_inl.h", "CompositeExplicitAutogradNonFunctionalFunctions.h", "CompositeExplicitAutogradNonFunctionalFunctions_inl.h", + "ViewMetaClasses.h", + "ViewMetaClasses.cpp", "VmapGeneratedPlumbing.h", "core/ATenOpList.cpp", "core/TensorBody.h", @@ -1192,6 +1194,7 @@ def define_buck_targets( "NativeMetaFunctions.h": ":gen_aten[NativeMetaFunctions.h]", "Operators.h": ":gen_aten[Operators.h]", "RedispatchFunctions.h": ":gen_aten[RedispatchFunctions.h]", + "ViewMetaClasses.h": ":gen_aten[ViewMetaClasses.h]", "core/TensorBody.h": ":gen_aten[core/TensorBody.h]", "core/aten_interned_strings.h": ":gen_aten[core/aten_interned_strings.h]", "core/enum_tag.h": ":gen_aten[core/enum_tag.h]", diff --git a/build.bzl b/build.bzl index 7c2c3e24dc5a..91529e75c9f0 100644 --- a/build.bzl +++ b/build.bzl @@ -118,6 +118,9 @@ def define_targets(rules): ":LazyNonNativeIr.h", ":RegisterDispatchDefinitions.ini", 
":RegisterDispatchKey.cpp", + ":ViewMetaClassesPythonBinding.cpp", + ":ViewMetaClasses.cpp", + ":ViewMetaClasses.h", ":native_functions.yaml", ":shape_inference.h", ":tags.yaml", @@ -170,6 +173,7 @@ GENERATED_H = [ "FunctionalInverses.h", "RedispatchFunctions.h", "RegistrationDeclarations.h", + "ViewMetaClasses.h", "VmapGeneratedPlumbing.h", ] @@ -246,6 +250,7 @@ GENERATED_CPP = [ "RegisterFunctionalization_1.cpp", "RegisterFunctionalization_2.cpp", "RegisterFunctionalization_3.cpp", + "ViewMetaClasses.cpp", ] GENERATED_CPP_CORE = [ @@ -307,6 +312,7 @@ _GENERATED_AUTOGRAD_PYTHON_CPP = [ "torch/csrc/autograd/generated/python_torch_functions_1.cpp", "torch/csrc/autograd/generated/python_torch_functions_2.cpp", "torch/csrc/autograd/generated/python_variable_methods.cpp", + "torch/csrc/functionalization/generated/ViewMetaClassesPythonBinding.cpp" ] GENERATED_AUTOGRAD_PYTHON = _GENERATED_AUTOGRAD_PYTHON_HEADERS + _GENERATED_AUTOGRAD_PYTHON_CPP diff --git a/build_variables.bzl b/build_variables.bzl index dfae1d527bb7..05f5fb1068c8 100644 --- a/build_variables.bzl +++ b/build_variables.bzl @@ -512,6 +512,7 @@ libtorch_distributed_base_sources = [ "torch/csrc/distributed/c10d/TCPStore.cpp", "torch/csrc/distributed/c10d/TCPStoreBackend.cpp", "torch/csrc/distributed/c10d/TCPStoreLibUvBackend.cpp", + "torch/csrc/distributed/c10d/Types.cpp", "torch/csrc/distributed/c10d/Utils.cpp", "torch/csrc/distributed/c10d/Work.cpp", "torch/csrc/distributed/c10d/comm.cpp", @@ -635,6 +636,12 @@ libtorch_nativert_sources = [ "torch/nativert/graph/passes/pass_manager/GraphPasses.cpp", "torch/nativert/graph/passes/pass_manager/PassManager.cpp", "torch/nativert/kernels/KernelHandlerRegistry.cpp", + "torch/nativert/kernels/TritonKernel.cpp", + "torch/nativert/executor/triton/CpuTritonKernelManager.cpp", +] + +libtorch_nativert_cuda_sources = [ + "torch/nativert/executor/triton/CudaTritonKernelManager.cpp", ] torch_mobile_tracer_sources = [ @@ -755,14 +762,22 @@ libtorch_cuda_distributed_extra_sources = [ "torch/csrc/distributed/c10d/symm_mem/NCCLSymmetricMemory.cu", "torch/csrc/distributed/c10d/symm_mem/intra_node_comm.cpp", "torch/csrc/distributed/c10d/symm_mem/intra_node_comm.cu", + "torch/csrc/distributed/c10d/symm_mem/cuda_mem_pool.cpp", "torch/csrc/distributed/rpc/tensorpipe_cuda.cpp", ] +libtorch_nvshmem_sources = [ + "torch/csrc/distributed/c10d/cuda/utils.cpp", + "torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryUtils.cpp", + "torch/csrc/distributed/c10d/symm_mem/nvshmem_extension.cu", + "torch/csrc/distributed/c10d/symm_mem/NVSHMEMSymmetricMemory.cu", +] + libtorch_cuda_distributed_sources = libtorch_cuda_distributed_base_sources + libtorch_cuda_distributed_extra_sources libtorch_cuda_sources = libtorch_cuda_core_sources + libtorch_cuda_distributed_sources + [ "torch/csrc/cuda/nccl.cpp", -] +] + libtorch_nativert_cuda_sources torch_cpp_srcs = [ "torch/csrc/api/src/cuda.cpp", # this just forwards stuff, no real CUDA @@ -992,6 +1007,7 @@ libtorch_python_core_sources = [ "torch/csrc/utils/disable_torch_function.cpp", "torch/csrc/utils/verbose.cpp", "torch/csrc/cpu/Module.cpp", + "torch/csrc/functionalization/Module.cpp", "torch/csrc/instruction_counter/Module.cpp", "torch/nativert/python/Bindings.cpp", ] + lazy_tensor_core_python_sources @@ -1034,6 +1050,7 @@ def glob_libtorch_python_sources(gencode_pattern = ":generate-code[{}]"): "torch/csrc/autograd/generated/python_torch_functions_1.cpp", "torch/csrc/autograd/generated/python_torch_functions_2.cpp", 
"torch/csrc/autograd/generated/python_variable_methods.cpp", + "torch/csrc/functionalization/generated/ViewMetaClassesPythonBinding.cpp", ]] _libtorch_python_sources.extend(libtorch_python_core_sources) @@ -1079,6 +1096,7 @@ aten_cpu_source_non_codegen_list = [ "aten/src/ATen/DeviceAccelerator.cpp", "aten/src/ATen/Context.cpp", "aten/src/ATen/DLConvertor.cpp", + "aten/src/ATen/DTensorState.cpp", "aten/src/ATen/EmptyTensor.cpp", "aten/src/ATen/ExpandUtils.cpp", "aten/src/ATen/CachedTensorUtils.cpp", diff --git a/c10/core/AllocatorConfig.cpp b/c10/core/AllocatorConfig.cpp index e154338d501b..c6b6e95f43b2 100644 --- a/c10/core/AllocatorConfig.cpp +++ b/c10/core/AllocatorConfig.cpp @@ -45,7 +45,7 @@ size_t AcceleratorAllocatorConfig::roundup_power2_divisions(size_t size) { 63 - llvm::countLeadingZeros(kRoundUpPowerOfTwoStart); const size_t interval_end = 63 - llvm::countLeadingZeros(kRoundUpPowerOfTwoEnd); - TORCH_CHECK_VALUE( + TORCH_CHECK( interval_end - interval_start == kRoundUpPowerOfTwoIntervals, "kRoundUpPowerOfTwoIntervals mismatch"); @@ -64,7 +64,7 @@ size_t AcceleratorAllocatorConfig::parseMaxSplitSize( std::numeric_limits::max() / kMB; size_t val_env = tokenizer.toSizeT(++i); - TORCH_CHECK_VALUE( + TORCH_CHECK( val_env >= min_allowed_split_size_mb, "CachingAllocator option max_split_size_mb too small, must be >= ", min_allowed_split_size_mb); @@ -83,7 +83,7 @@ size_t AcceleratorAllocatorConfig::parseMaxNonSplitRoundingSize( std::numeric_limits::max() / kMB; size_t val_env = tokenizer.toSizeT(++i); - TORCH_CHECK_VALUE( + TORCH_CHECK( val_env >= min_allowed_split_size_mb, "CachingAllocator option max_non_split_rounding_mb too small, must be >= ", min_allowed_split_size_mb); @@ -98,7 +98,7 @@ size_t AcceleratorAllocatorConfig::parseGarbageCollectionThreshold( size_t i) { tokenizer.checkToken(++i, ":"); double val_env = tokenizer.toDouble(++i); - TORCH_CHECK_VALUE( + TORCH_CHECK( val_env > 0 && val_env < 1.0, "garbage_collect_threshold is invalid, set it in (0.0, 1.0)"); garbage_collection_threshold_ = val_env; @@ -119,7 +119,7 @@ size_t AcceleratorAllocatorConfig::parseRoundUpPower2Divisions( size_t value_index = i; tokenizer.checkToken(++i, ":"); size_t value = tokenizer.toSizeT(++i); - TORCH_CHECK_VALUE( + TORCH_CHECK( value == 0 || llvm::isPowerOf2_64(value), "For roundups, the divisions has to be power of 2 or 0 to disable roundup "); @@ -133,7 +133,7 @@ size_t AcceleratorAllocatorConfig::parseRoundUpPower2Divisions( value); } else { size_t boundary = tokenizer.toSizeT(value_index); - TORCH_CHECK_VALUE( + TORCH_CHECK( llvm::isPowerOf2_64(boundary), "For roundups, the intervals have to be power of 2 "); @@ -163,7 +163,7 @@ size_t AcceleratorAllocatorConfig::parseRoundUpPower2Divisions( "Expected closing bracket ']' in ConfigTokenizer but reached end of config"); } else { // Keep this for backwards compatibility size_t value = tokenizer.toSizeT(i); - TORCH_CHECK_VALUE( + TORCH_CHECK( llvm::isPowerOf2_64(value), "For roundups, the divisions has to be power of 2 "); std::fill( diff --git a/c10/core/AllocatorConfig.h b/c10/core/AllocatorConfig.h index efde5e3a8ff9..68cc47a8417c 100644 --- a/c10/core/AllocatorConfig.h +++ b/c10/core/AllocatorConfig.h @@ -76,7 +76,7 @@ class ConfigTokenizer { } else if (token == "False") { return false; } else { - TORCH_CHECK_VALUE( + TORCH_CHECK( false, "Expected 'True' or 'False' at index ", i, diff --git a/c10/core/Contiguity.h b/c10/core/Contiguity.h index 279a795583b1..eed3f2498342 100644 --- a/c10/core/Contiguity.h +++ b/c10/core/Contiguity.h @@ 
-33,7 +33,8 @@ bool _compute_contiguous(ArrayRef sizes, ArrayRef strides, T numel) { } // Return a SymBool with underlying symbolic expression that represents -// contiguity. Guaranteed not to add guards. +// contiguity. Guaranteed not to throw DDE, may return a symbolic expression +// or symbolic True. inline static c10::SymBool _compute_contiguous_sym( ArrayRef sizes, ArrayRef strides, @@ -76,6 +77,8 @@ inline static c10::SymBool _compute_contiguous_sym( return true; }; + // We try to minimize creating large symbolic expressions when not needed to + // avoid symbolic evaluation perf issues. if (is_contiguous_or_false()) { return c10::SymBool(true); } @@ -94,6 +97,9 @@ inline static c10::SymBool _compute_contiguous_sym( return is_contiguous_cond.sym_or(is_empty); } +// When T is SymInt this function may throw a data dependent error. +// _compute_channels_last_contiguous_2d_sym does not. Only use this function +// when inputs are hinted. template bool _compute_channels_last_contiguous_2d( ArrayRef sizes, @@ -105,8 +111,8 @@ bool _compute_channels_last_contiguous_2d( T expected = 1; for (auto& d : {1, 3, 2, 0}) { const auto& size_d = sizes[d]; - if (TORCH_GUARD_SIZE_OBLIVIOUS(sym_ne(size_d, 1))) { - if (TORCH_GUARD_SIZE_OBLIVIOUS(sym_ne(strides[d], expected))) { + if (size_d != 1) { + if (strides[d] != expected) { return false; } expected *= size_d; @@ -123,6 +129,65 @@ bool _compute_channels_last_contiguous_2d( } } +// Return a SymBool with underlying symbolic expression that represents +// contiguity. Guaranteed not to throw DDE, may return a symbolic expression +// or symbolic True. +inline static c10::SymBool _compute_channels_last_contiguous_2d_sym( + ArrayRef sizes, + ArrayRef strides) { + switch (sizes.size()) { + case 4: { + // When this function returns True, the result is always true. When it returns + // False, the result could be False or data dependent. + auto guard_or_false = [&]() { + c10::SymInt expected = 1; + for (auto& d : {1, 3, 2, 0}) { + const auto& size_d = sizes[d]; + // Not taking this branch could make this return False instead of True + // but not vice-versa, so it's ok. + if (TORCH_GUARD_OR_FALSE(sym_eq(sizes[d], 1))) { + continue; + } + // Taking this branch could make this return False instead of True + // but not vice-versa, so it's ok. + if (TORCH_GUARD_OR_TRUE(sym_ne(strides[d], expected))) { + return false; + } + expected *= size_d; + } + return true; + }; + + // We try to minimize creating large symbolic expressions when not needed + // to avoid symbolic evaluation perf issues. + if (guard_or_false()) { + return c10::SymBool(true); + } + + // Result is either false, or data dependent. + c10::SymInt expected_stride = 1; + c10::SymBool cond = true; + + for (auto& d : {1, 3, 2, 0}) { + const auto& size_d = sizes[d]; + cond = cond.sym_and( + size_d.sym_eq(1).sym_or(sym_eq(strides[d], expected_stride))); + expected_stride *= size_d; + } + return cond; + } + // NOLINTNEXTLINE(bugprone-branch-clone) + case 3: + // TODO dim == 3 case will be enabled once it is fully tested + return c10::SymBool(false); + default: + return c10::SymBool(false); + } +} + +// When T is SymInt this function may throw a data dependent error. +// _compute_channels_last_contiguous_3d_sym does not. Only use this function +// when inputs are hinted.
template bool _compute_channels_last_contiguous_3d( ArrayRef sizes, @@ -134,8 +199,8 @@ bool _compute_channels_last_contiguous_3d( T expected = 1; for (auto& d : {1, 4, 3, 2, 0}) { const auto& size_d = sizes[d]; - if (TORCH_GUARD_SIZE_OBLIVIOUS(sym_ne(size_d, 1))) { - if (TORCH_GUARD_SIZE_OBLIVIOUS(sym_ne(strides[d], expected))) { + if (size_d != 1) { + if (strides[d] != expected) { return false; } expected *= size_d; @@ -152,6 +217,59 @@ bool _compute_channels_last_contiguous_3d( } } +inline static c10::SymBool _compute_channels_last_contiguous_3d_sym( + ArrayRef sizes, + ArrayRef strides) { + switch (sizes.size()) { + case 5: { + // When this function return True, result always true. When it return + // False, result could be False or data dependent. + auto guard_or_false = [&]() { + c10::SymInt expected = 1; + for (auto& d : {1, 4, 3, 2, 0}) { + const auto& size_d = sizes[d]; + // Not taking this branch could make this return False instead of True + // but not vice-versa. so its ok. + if (TORCH_GUARD_OR_FALSE(sym_eq(sizes[d], 1))) { + continue; + } + // Taking this branch could make this return False instead of True + // but not vice-versa. so its ok. + if (TORCH_GUARD_OR_TRUE(sym_ne(strides[d], expected))) { + return false; + } + expected *= size_d; + } + return true; + }; + + // We try to minimize creating large symbolic expressions when not needed + // to avoid symbolic evaluation perf issues. + if (guard_or_false()) { + return c10::SymBool(true); + } + + // Result is either false, or data dependent. + c10::SymInt expected_stride = 1; + c10::SymBool cond = true; + + for (auto& d : {1, 4, 3, 2, 0}) { + const auto& size_d = sizes[d]; + cond = cond.sym_and( + size_d.sym_eq(1).sym_or(sym_eq(strides[d], expected_stride))); + expected_stride *= size_d; + } + return cond; + } + // NOLINTNEXTLINE(bugprone-branch-clone) + case 4: + // TODO dim == 4 case will be enabled once it is fully tested + return c10::SymBool(false); + default: + return c10::SymBool(false); + } +} + template bool _compute_non_overlapping_and_dense( ArrayRef sizes, diff --git a/c10/core/SymInt.cpp b/c10/core/SymInt.cpp index c6c2743d8358..b78ca94dc514 100644 --- a/c10/core/SymInt.cpp +++ b/c10/core/SymInt.cpp @@ -20,6 +20,14 @@ void SymInt::promote_to_negative() { s.data_ = 0; } +std::optional SymInt::maybe_as_int_slow_path() const { + auto* node = toSymNodeImplUnowned(); + if (auto c = node->constant_int()) { + return c; + } + return node->maybe_as_int(); +} + SymNode SymInt::toSymNode() const { TORCH_CHECK_ALWAYS_SHOW_CPP_STACKTRACE( is_heap_allocated(), "SymInt::toSymNode is_heap_allocated"); @@ -45,12 +53,11 @@ bool SymInt::has_hint() const { #define DEFINE_BINARY(API, OP, METHOD, RET) \ RET SymInt::API(const SymInt& sci) const { \ if (auto ma = maybe_as_int()) { \ - if (auto mb = sci.maybe_as_int()) { \ - return RET(OP(*ma, *mb)); \ - } else { \ - auto b = sci.toSymNode(); \ - return RET(b->wrap_int(*ma)->METHOD(b)); \ - } \ + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( \ + !sci.maybe_as_int(), \ + "should have hit fast path in the header in this case."); \ + auto b = sci.toSymNode(); \ + return RET(b->wrap_int(*ma)->METHOD(b)); \ } else { \ if (auto mb = sci.maybe_as_int()) { \ auto a = toSymNodeImplUnowned(); \ @@ -61,19 +68,19 @@ bool SymInt::has_hint() const { } \ } -DEFINE_BINARY(operator+, std::plus<>(), add, SymInt) -DEFINE_BINARY(operator-, std::minus<>(), sub, SymInt) -DEFINE_BINARY(operator*, std::multiplies<>(), mul, SymInt) -DEFINE_BINARY(operator/, std::divides<>(), floordiv, SymInt) -DEFINE_BINARY(operator%, 
std::modulus<>(), mod, SymInt) -DEFINE_BINARY(sym_eq, std::equal_to<>(), eq, SymBool) -DEFINE_BINARY(sym_ne, std::not_equal_to<>(), ne, SymBool) -DEFINE_BINARY(sym_lt, std::less<>(), lt, SymBool) -DEFINE_BINARY(sym_le, std::less_equal<>(), le, SymBool) -DEFINE_BINARY(sym_gt, std::greater<>(), gt, SymBool) -DEFINE_BINARY(sym_ge, std::greater_equal<>(), ge, SymBool) -DEFINE_BINARY(min, std::min, sym_min, SymInt) -DEFINE_BINARY(max, std::max, sym_max, SymInt) +DEFINE_BINARY(operator_add_slow_path, std::plus<>(), add, SymInt) +DEFINE_BINARY(operator_sub_slow_path, std::minus<>(), sub, SymInt) +DEFINE_BINARY(operator_mul_slow_path, std::multiplies<>(), mul, SymInt) +DEFINE_BINARY(operator_div_slow_path, std::divides<>(), floordiv, SymInt) +DEFINE_BINARY(operator_mod_slow_path, std::modulus<>(), mod, SymInt) +DEFINE_BINARY(sym_eq_slow_path, std::equal_to<>(), eq, SymBool) +DEFINE_BINARY(sym_ne_slow_path, std::not_equal_to<>(), ne, SymBool) +DEFINE_BINARY(sym_lt_slow_path, std::less<>(), lt, SymBool) +DEFINE_BINARY(sym_le_slow_path, std::less_equal<>(), le, SymBool) +DEFINE_BINARY(sym_gt_slow_path, std::greater<>(), gt, SymBool) +DEFINE_BINARY(sym_ge_slow_path, std::greater_equal<>(), ge, SymBool) +DEFINE_BINARY(min_slow_path, std::min, sym_min, SymInt) +DEFINE_BINARY(max_slow_path, std::max, sym_max, SymInt) SymInt::operator SymFloat() const { if (auto ma = maybe_as_int()) { @@ -153,15 +160,15 @@ SymInt operator-(const SymInt& s) { } } -void SymInt::operator*=(const SymInt& sci) { +void SymInt::operator_imul_slow_path(const SymInt& sci) { *this = *this * sci; } -void SymInt::operator/=(const SymInt& sci) { +void SymInt::operator_idiv_slow_path(const SymInt& sci) { *this = *this / sci; } -void SymInt::operator+=(const SymInt& sci) { +void SymInt::operator_iadd_slow_path(const SymInt& sci) { *this = *this + sci; } diff --git a/c10/core/SymInt.h b/c10/core/SymInt.h index 51686f8b81af..9b1c776cbe2a 100644 --- a/c10/core/SymInt.h +++ b/c10/core/SymInt.h @@ -7,6 +7,7 @@ #include #include +#include #include #include #include @@ -177,23 +178,136 @@ class C10_API SymInt { #endif } - SymInt operator+(const SymInt& sci) const; - SymInt operator-(const SymInt& sci) const; - SymInt operator*(const SymInt& sci) const; - SymInt operator/(const SymInt& sci) const; - SymInt operator%(const SymInt& sci) const; - void operator*=(const SymInt& sci); - void operator+=(const SymInt& sci); - void operator/=(const SymInt& sci); + SymInt operator+(const SymInt& sci) const { + if (auto ma = maybe_as_int()) { + if (auto mb = sci.maybe_as_int()) { + return SymInt(*ma + *mb); + } + } + return operator_add_slow_path(sci); + } + + SymInt operator-(const SymInt& sci) const { + if (auto ma = maybe_as_int()) { + if (auto mb = sci.maybe_as_int()) { + return SymInt(*ma - *mb); + } + } + return operator_sub_slow_path(sci); + } + + SymInt operator*(const SymInt& sci) const { + if (auto ma = maybe_as_int()) { + if (auto mb = sci.maybe_as_int()) { + return SymInt(*ma * *mb); + } + } + return operator_mul_slow_path(sci); + } + + SymInt operator/(const SymInt& sci) const { + if (auto ma = maybe_as_int()) { + if (auto mb = sci.maybe_as_int()) { + return SymInt(*ma / *mb); + } + } + return operator_div_slow_path(sci); + } + + SymInt operator%(const SymInt& sci) const { + if (auto ma = maybe_as_int()) { + if (auto mb = sci.maybe_as_int()) { + return SymInt(*ma % *mb); + } + } + return operator_mod_slow_path(sci); + } + + void operator*=(const SymInt& sci) { + if (auto ma = maybe_as_int()) { + if (auto mb = sci.maybe_as_int()) { + *this = 
SymInt(*ma * *mb); + return; + } + } + operator_imul_slow_path(sci); + } + + void operator+=(const SymInt& sci) { + if (auto ma = maybe_as_int()) { + if (auto mb = sci.maybe_as_int()) { + *this = SymInt(*ma + *mb); + return; + } + } + operator_iadd_slow_path(sci); + } + + void operator/=(const SymInt& sci) { + if (auto ma = maybe_as_int()) { + if (auto mb = sci.maybe_as_int()) { + *this = SymInt(*ma / *mb); + return; + } + } + operator_idiv_slow_path(sci); + } SymInt clone() const; - SymBool sym_eq(const SymInt&) const; - SymBool sym_ne(const SymInt&) const; - SymBool sym_lt(const SymInt&) const; - SymBool sym_le(const SymInt&) const; - SymBool sym_gt(const SymInt&) const; - SymBool sym_ge(const SymInt&) const; + SymBool sym_eq(const SymInt& sci) const { + if (auto ma = maybe_as_int()) { + if (auto mb = sci.maybe_as_int()) { + return SymBool(*ma == *mb); + } + } + return sym_eq_slow_path(sci); + } + + SymBool sym_ne(const SymInt& sci) const { + if (auto ma = maybe_as_int()) { + if (auto mb = sci.maybe_as_int()) { + return SymBool(*ma != *mb); + } + } + return sym_ne_slow_path(sci); + } + + SymBool sym_lt(const SymInt& sci) const { + if (auto ma = maybe_as_int()) { + if (auto mb = sci.maybe_as_int()) { + return SymBool(*ma < *mb); + } + } + return sym_lt_slow_path(sci); + } + + SymBool sym_le(const SymInt& sci) const { + if (auto ma = maybe_as_int()) { + if (auto mb = sci.maybe_as_int()) { + return SymBool(*ma <= *mb); + } + } + return sym_le_slow_path(sci); + } + + SymBool sym_gt(const SymInt& sci) const { + if (auto ma = maybe_as_int()) { + if (auto mb = sci.maybe_as_int()) { + return SymBool(*ma > *mb); + } + } + return sym_gt_slow_path(sci); + } + + SymBool sym_ge(const SymInt& sci) const { + if (auto ma = maybe_as_int()) { + if (auto mb = sci.maybe_as_int()) { + return SymBool(*ma >= *mb); + } + } + return sym_ge_slow_path(sci); + } bool operator==(const SymInt& o) const { return sym_eq(o).guard_bool(__FILE__, __LINE__); @@ -214,8 +328,23 @@ class C10_API SymInt { return sym_ge(o).guard_bool(__FILE__, __LINE__); } - SymInt min(const SymInt& sci) const; - SymInt max(const SymInt& sci) const; + SymInt min(const SymInt& sci) const { + if (auto ma = maybe_as_int()) { + if (auto mb = sci.maybe_as_int()) { + return SymInt(std::min(*ma, *mb)); + } + } + return min_slow_path(sci); + } + + SymInt max(const SymInt& sci) const { + if (auto ma = maybe_as_int()) { + if (auto mb = sci.maybe_as_int()) { + return SymInt(std::max(*ma, *mb)); + } + } + return max_slow_path(sci); + } // If both are symbolic, this checks if // they share the same node. 
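A small usage note on the restructuring above (hedged, not part of the patch): with the operators now defined inline, arithmetic between two hinted `SymInt`s never leaves the header, and only operands backed by a real `SymNode` reach the new `*_slow_path` functions in `SymInt.cpp`.

```cpp
#include <c10/core/SymInt.h>

void symint_fast_path_sketch() {
  c10::SymInt a(6), b(3);          // plain ints: maybe_as_int() holds a value
  c10::SymInt sum = a + b;         // folded inline in SymInt.h, no SymNode involved
  bool same = (a / b) == c10::SymInt(2);  // comparisons also stay on the int path

  // Had either operand wrapped a SymNode (a real symbolic size), the inline
  // maybe_as_int() check would fail and the call would forward to
  // operator_add_slow_path() / sym_eq_slow_path() defined out of line.
  (void)sum;
  (void)same;
}
```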
@@ -239,11 +368,7 @@ class C10_API SymInt { if (!is_heap_allocated()) { return data_; } - auto* node = toSymNodeImplUnowned(); - if (auto c = node->constant_int()) { - return c; - } - return node->maybe_as_int(); + return maybe_as_int_slow_path(); } // Return whether the integer is directly coercible to a SymInt @@ -264,6 +389,25 @@ class C10_API SymInt { private: void promote_to_negative(); + SymInt operator_add_slow_path(const SymInt& sci) const; + SymInt operator_sub_slow_path(const SymInt& sci) const; + SymInt operator_mul_slow_path(const SymInt& sci) const; + SymInt operator_div_slow_path(const SymInt& sci) const; + SymInt operator_mod_slow_path(const SymInt& sci) const; + void operator_imul_slow_path(const SymInt& sci); + void operator_iadd_slow_path(const SymInt& sci); + void operator_idiv_slow_path(const SymInt& sci); + SymBool sym_eq_slow_path(const SymInt& sci) const; + SymBool sym_ne_slow_path(const SymInt& sci) const; + SymBool sym_lt_slow_path(const SymInt& sci) const; + SymBool sym_le_slow_path(const SymInt& sci) const; + SymBool sym_gt_slow_path(const SymInt& sci) const; + SymBool sym_ge_slow_path(const SymInt& sci) const; + + SymInt min_slow_path(const SymInt& sci) const; + SymInt max_slow_path(const SymInt& sci) const; + + std::optional maybe_as_int_slow_path() const; // Constraints on the internal representation: // diff --git a/c10/core/SymbolicShapeMeta.cpp b/c10/core/SymbolicShapeMeta.cpp index 6fa2ab0ed4f1..01276d416fbb 100644 --- a/c10/core/SymbolicShapeMeta.cpp +++ b/c10/core/SymbolicShapeMeta.cpp @@ -71,6 +71,27 @@ normalize_sym_sizes_strides(SymIntArrayRef sizes, SymIntArrayRef strides) { return std::tuple, std::vector>( std::move(base), std::move(size_nodes), std::move(stride_nodes)); } +namespace { +bool all_hinted( + const c10::SymIntArrayRef& sizes, + const c10::SymIntArrayRef& strides) { + auto all_hinted = true; + for (const auto& s : sizes) { + if (!s.has_hint()) { + return false; + } + } + + if (all_hinted) { + for (const auto& s : strides) { + if (!s.has_hint()) { + return false; + } + } + } + return all_hinted; +} +} // namespace // Special treatment because of numel SymBool SymbolicShapeMeta::compute_contiguous() const { @@ -88,28 +109,61 @@ SymBool SymbolicShapeMeta::compute_contiguous() const { return maybe_as_bool.value(); } - auto all_hinted = true; - for (const auto& s : sizes) { - if (!s.has_hint()) { - all_hinted = false; - break; - } + if (all_hinted(sizes, strides)) { + // We avoid going through the slow path if everything is hinted, + // because evaluating a large SymPy expression can be expensive. + // TODO exclude backed_size_oblivious from this path. + return _compute_contiguous(sizes_, strides_, numel()); } - if (all_hinted) { - for (const auto& s : strides) { - if (!s.has_hint()) { - all_hinted = false; - break; - } - } + return result; +} + +SymBool SymbolicShapeMeta::compute_channels_last_contiguous_2d() const { + if (!strides_valid_) { + return false; } + c10::SymIntArrayRef sizes(sizes_); + c10::SymIntArrayRef strides(strides_); - if (all_hinted) { + auto result = _compute_channels_last_contiguous_2d_sym(sizes, strides); + + // If the result is already determined without guarding, just return it. + auto maybe_as_bool = result.maybe_as_bool(); + if (maybe_as_bool.has_value()) { + return maybe_as_bool.value(); + } + + if (all_hinted(sizes, strides)) { // We avoid going through the slow path if everything is hinted, // because evaluating a large SymPy expression can be expensive. // TODO exclude backed_size_oblivious from this path. 
- return _compute_contiguous(sizes_, strides_, numel()); + return _compute_channels_last_contiguous_2d(sizes_, strides_); + } + + return result; +} + +SymBool SymbolicShapeMeta::compute_channels_last_contiguous_3d() const { + if (!strides_valid_) { + return false; + } + c10::SymIntArrayRef sizes(sizes_); + c10::SymIntArrayRef strides(strides_); + + auto result = _compute_channels_last_contiguous_3d_sym(sizes, strides); + + // If the result is already determined without guarding, just return it. + auto maybe_as_bool = result.maybe_as_bool(); + if (maybe_as_bool.has_value()) { + return maybe_as_bool.value(); + } + + if (all_hinted(sizes, strides)) { + // We avoid going through the slow path if everything is hinted, + // because evaluating a large SymPy expression can be expensive. + // TODO exclude backed_size_oblivious from this path. + return _compute_channels_last_contiguous_3d(sizes_, strides_); } return result; @@ -143,8 +197,6 @@ SymBool SymbolicShapeMeta::compute_contiguous() const { } // clang-format off -DEFINE_EAGER_SYMBOOL_COMPUTE(compute_channels_last_contiguous_2d, _compute_channels_last_contiguous_2d) -DEFINE_EAGER_SYMBOOL_COMPUTE(compute_channels_last_contiguous_3d, _compute_channels_last_contiguous_3d) DEFINE_EAGER_SYMBOOL_COMPUTE(compute_strides_like_channels_last_2d, is_channels_last_strides_2d) DEFINE_EAGER_SYMBOOL_COMPUTE(compute_strides_like_channels_last_3d, is_channels_last_strides_3d) diff --git a/c10/core/TensorImpl.cpp b/c10/core/TensorImpl.cpp index f3ec2f2d46ea..cd0321d3bb6f 100644 --- a/c10/core/TensorImpl.cpp +++ b/c10/core/TensorImpl.cpp @@ -313,8 +313,15 @@ void TensorImpl::throw_data_ptr_access_error() const { c10::SymBool TensorImpl::sym_is_contiguous_custom( at::MemoryFormat memory_format) const { if (C10_UNLIKELY(matches_python_custom(SizesStridesPolicy::CustomStrides))) { - return pyobj_slot_.load_pyobj_interpreter()->is_contiguous( - this, memory_format); + // TO reduce BC breaking and reduce having to introduce + // sym_is_contiguous. 
call is_contiguous when tensor does not + if (C10_UNLIKELY(has_symbolic_sizes_strides_)) { + return pyobj_slot_.load_pyobj_interpreter()->sym_is_contiguous( + this, memory_format); + } else { + return pyobj_slot_.load_pyobj_interpreter()->is_contiguous( + this, memory_format); + } } return sym_is_contiguous_default(memory_format); diff --git a/c10/core/impl/PyInterpreter.cpp b/c10/core/impl/PyInterpreter.cpp index b4ae1d612e96..913bc7872657 100644 --- a/c10/core/impl/PyInterpreter.cpp +++ b/c10/core/impl/PyInterpreter.cpp @@ -60,6 +60,10 @@ struct NoopPyInterpreterVTable final : public PyInterpreterVTable { bool is_contiguous(const TensorImpl* self, at::MemoryFormat) const override { PANIC(is_contiguous); } + c10::SymBool sym_is_contiguous(const TensorImpl* self, at::MemoryFormat) + const override { + PANIC(sym_is_contiguous); + } bool is_strides_like(const TensorImpl* self, at::MemoryFormat) const override { PANIC(is_strides_like); diff --git a/c10/core/impl/PyInterpreter.h b/c10/core/impl/PyInterpreter.h index 09d4801f7d83..def708c24b80 100644 --- a/c10/core/impl/PyInterpreter.h +++ b/c10/core/impl/PyInterpreter.h @@ -168,6 +168,9 @@ struct C10_API PyInterpreterVTable { virtual bool is_contiguous(const TensorImpl* self, at::MemoryFormat) const = 0; + virtual c10::SymBool sym_is_contiguous( + const TensorImpl* self, + at::MemoryFormat) const = 0; virtual bool is_strides_like(const TensorImpl* self, at::MemoryFormat) const = 0; virtual bool is_non_overlapping_and_dense(const TensorImpl* self) const = 0; diff --git a/c10/cuda/CUDAAllocatorConfig.cpp b/c10/cuda/CUDAAllocatorConfig.cpp index d2efb8c593e4..8706f7362a3d 100644 --- a/c10/cuda/CUDAAllocatorConfig.cpp +++ b/c10/cuda/CUDAAllocatorConfig.cpp @@ -25,6 +25,7 @@ CUDAAllocatorConfig::CUDAAllocatorConfig() #endif m_release_lock_on_cudamalloc(false), m_pinned_use_cuda_host_register(false), + m_graph_capture_record_stream_reuse(false), m_pinned_use_background_threads(false) { m_roundup_power2_divisions.assign(kRoundUpPowerOfTwoIntervals, 0); } @@ -373,6 +374,9 @@ void CUDAAllocatorConfig::parseArgs(const std::optional& env) { } else if (config_item_view == "pinned_use_background_threads") { i = parsePinnedUseBackgroundThreads(config, i); used_native_specific_option = true; + } else if (config_item_view == "graph_capture_record_stream_reuse") { + i = parseGraphCaptureRecordStreamReuse(config, i); + used_native_specific_option = true; } else { TORCH_CHECK( false, "Unrecognized CachingAllocator option: ", config_item_view); @@ -406,6 +410,23 @@ size_t CUDAAllocatorConfig::parsePinnedUseCudaHostRegister( return i; } +size_t CUDAAllocatorConfig::parseGraphCaptureRecordStreamReuse( + const std::vector& config, + size_t i) { + consumeToken(config, ++i, ':'); + if (++i < config.size()) { + TORCH_CHECK( + (config[i] == "True" || config[i] == "False"), + "Expected a single True/False argument for graph_capture_record_stream_reuse"); + m_graph_capture_record_stream_reuse = (config[i] == "True"); + } else { + TORCH_CHECK( + false, "Error, expecting graph_capture_record_stream_reuse value", ""); + } + + return i; +} + size_t CUDAAllocatorConfig::parsePinnedNumRegisterThreads( const std::vector& config, size_t i) { diff --git a/c10/cuda/CUDAAllocatorConfig.h b/c10/cuda/CUDAAllocatorConfig.h index fda3cc02e5d0..54c41ba70fb6 100644 --- a/c10/cuda/CUDAAllocatorConfig.h +++ b/c10/cuda/CUDAAllocatorConfig.h @@ -53,6 +53,10 @@ class C10_CUDA_API CUDAAllocatorConfig { return instance().m_release_lock_on_cudamalloc; } + static bool 
graph_capture_record_stream_reuse() { + return instance().m_graph_capture_record_stream_reuse; + } + /** Pinned memory allocator settings */ static bool pinned_use_cuda_host_register() { return instance().m_pinned_use_cuda_host_register; @@ -142,6 +146,9 @@ class C10_CUDA_API CUDAAllocatorConfig { size_t parsePinnedUseBackgroundThreads( const std::vector& config, size_t i); + size_t parseGraphCaptureRecordStreamReuse( + const std::vector& config, + size_t i); std::atomic m_max_split_size; std::atomic m_max_non_split_rounding_size; @@ -153,6 +160,7 @@ class C10_CUDA_API CUDAAllocatorConfig { m_expandable_segments_handle_type; std::atomic m_release_lock_on_cudamalloc; std::atomic m_pinned_use_cuda_host_register; + std::atomic m_graph_capture_record_stream_reuse; std::atomic m_pinned_use_background_threads; std::string m_last_allocator_settings; std::mutex m_last_allocator_settings_mutex; diff --git a/c10/cuda/CUDACachingAllocator.cpp b/c10/cuda/CUDACachingAllocator.cpp index e701f1527c00..93ac4f7a4c64 100644 --- a/c10/cuda/CUDACachingAllocator.cpp +++ b/c10/cuda/CUDACachingAllocator.cpp @@ -1167,8 +1167,13 @@ class DeviceCachingAllocator { // tracks which pools we can use as a last resort before ooming ska::flat_hash_set use_on_oom_pools; - // See free() for this thing's purpose - std::vector needs_events_deferred_until_no_capture; + // Map of blocks whose freeing is deferred until after CUDA graph capture. + // - Key: Block* to be freed. + // - Value: List of "empty nodes" inserted as free markers during capture. + // If the vector is empty, the block must always be deferred until capture + // ends. + ska::flat_hash_map> deferred_blocks; + // outstanding cuda events ska::flat_hash_map< cuda::CUDAStream, @@ -1329,6 +1334,11 @@ class DeviceCachingAllocator { // capture. Cross-stream memory use is uncommon, so the deferral's // effect on memory use during capture should be small. process_events(context); + } else { + if (CUDAAllocatorConfig::graph_capture_record_stream_reuse()) { + // We check if there is some block that is safe to reuse on this stream + free_safe_blocks_in_capture(context, stream); + } } size_t size = round_size(orig_size); auto& pool = get_pool(size, stream); @@ -1619,6 +1629,248 @@ class DeviceCachingAllocator { return block; } + // Insert "free marker" (empty nodes) into the CUDA graph for all streams that + // have used the block, including the allocation stream. These nodes mark the + // last use of the block in the capture graph. Returns a vector of the + // inserted nodes, or an empty vector if any stream is not capturing. 
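For reference, the new knob is read from the caching-allocator settings string (normally supplied through `PYTORCH_CUDA_ALLOC_CONF`, e.g. `graph_capture_record_stream_reuse:True`), and the `deferred_blocks` map above is what the capture-time reuse check consults. A condensed sketch of that per-block decision, which `free_safe_blocks_in_capture` below implements (the helper name here is illustrative):

```cpp
#include <vector>
#include <cuda_runtime.h>
#include <c10/util/flat_hash_map.h>

// Hedged sketch: an empty marker list always means "wait until capture ends";
// otherwise the block may be reused only once every one of its free markers is
// ordered before every terminal node of the capture graph.
bool block_reusable_during_capture(
    const std::vector<cudaGraphNode_t>& free_markers,
    const ska::flat_hash_set<cudaGraphNode_t>& markers_behind_all_terminals) {
  if (free_markers.empty()) {
    return false;
  }
  for (const auto& node : free_markers) {
    if (markers_behind_all_terminals.count(node) == 0) {
      return false;
    }
  }
  return true;
}
```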
+ std::vector insert_free_marker(Block* block) { + std::vector empty_nodes; + + auto try_add_empty_node = [&](cudaStream_t stream) -> bool { + cudaStreamCaptureStatus status{}; + cudaGraph_t graph{}; + const cudaGraphNode_t* deps = nullptr; + size_t num_deps = 0; +#if (defined(CUDA_VERSION) && CUDA_VERSION >= 13000) + C10_CUDA_CHECK(cudaStreamGetCaptureInfo( + stream, &status, nullptr, &graph, &deps, nullptr, &num_deps)); +#else + C10_CUDA_CHECK(cudaStreamGetCaptureInfo_v2( + stream, &status, nullptr, &graph, &deps, &num_deps)); +#endif + + TORCH_INTERNAL_ASSERT( + status != cudaStreamCaptureStatusInvalidated, + "Invalid stream capture status"); + + if (status == cudaStreamCaptureStatusNone) { + return false; + } + + cudaGraphNode_t node{}; + C10_CUDA_CHECK(cudaGraphAddEmptyNode(&node, graph, deps, num_deps)); +#if (defined(CUDA_VERSION) && CUDA_VERSION >= 13000) + C10_CUDA_CHECK(cudaStreamUpdateCaptureDependencies( + stream, &node, nullptr, 1, cudaStreamSetCaptureDependencies)); +#else + C10_CUDA_CHECK(cudaStreamUpdateCaptureDependencies( + stream, &node, 1, cudaStreamSetCaptureDependencies)); +#endif + empty_nodes.push_back(node); + return true; + }; + + // If any stream is not currently capturing, return an empty node vector. + // An empty vector indicates that the block should be deferred for freeing + // until after capture. + + // Attempt to add an empty node for the allocation stream. + if (!try_add_empty_node(block->stream)) { + return {}; + } + // Attempt to add empty nodes for all streams that have used the block. + for (const auto& s : block->stream_uses) { + if (!try_add_empty_node(s.stream())) { + return {}; + } + } + return empty_nodes; + } + + // Returns the current set of "terminal" nodes in the CUDA graph for a given + // stream. These represent the current endpoints of the stream, and may + // include additional nodes if the graph branches. Any new work captured will + // be attached after one or more of these terminals. + std::vector get_terminals(cudaStream_t stream) { + std::vector result; + + cudaStreamCaptureStatus status{}; + cudaGraph_t graph{}; + const cudaGraphNode_t* dependencies = nullptr; + size_t num_dependencies = 0; + +#if (defined(CUDA_VERSION) && CUDA_VERSION >= 13000) + C10_CUDA_CHECK(cudaStreamGetCaptureInfo( + stream, + &status, + nullptr, + &graph, + &dependencies, + nullptr, + &num_dependencies)); +#else + C10_CUDA_CHECK(cudaStreamGetCaptureInfo_v2( + stream, &status, nullptr, &graph, &dependencies, &num_dependencies)); +#endif + + TORCH_INTERNAL_ASSERT( + status == cudaStreamCaptureStatusActive, + "Invalid stream capture status"); + + for (size_t i = 0; i < num_dependencies; i++) { + auto node = dependencies[i]; + if (node != nullptr) { + result.push_back(node); + } + } + + return result; + } + + // Returns the set of "reusable" free markers (empty nodes) in the current + // CUDA graph capture. A free marker is considered reusable if it is a + // predecessor of every terminal node. + // This ensures that all future captured work will occur after the free + // marker, making it safe to reuse. + ska::flat_hash_set get_reusable_empty_nodes( + cudaStream_t stream) { + auto terminals = get_terminals(stream); + if (terminals.empty()) { + // No terminal nodes found; nothing to free. 
+ return {}; + } + + auto get_dependencies = [](cudaGraphNode_t node, + cudaGraphNode_t* pDependencies, + size_t* pNumDependencies) -> void { +#if (defined(CUDA_VERSION) && CUDA_VERSION >= 13000) + C10_CUDA_CHECK(cudaGraphNodeGetDependencies( + node, pDependencies, nullptr, pNumDependencies)); +#else + C10_CUDA_CHECK( + cudaGraphNodeGetDependencies(node, pDependencies, pNumDependencies)); +#endif + }; + + // Helper to retrieve all parent nodes (dependencies) of a given node. + auto get_parents = + [&](cudaGraphNode_t node) -> std::vector { + size_t count = 0; + get_dependencies(node, nullptr, &count); + std::vector out(count); + if (count) { + get_dependencies(node, out.data(), &count); + out.resize(count); + } + return out; + }; + + // Helper to determine if a node is an empty node (used as a free marker). + auto is_empty_node = [](cudaGraphNode_t n) -> bool { + cudaGraphNodeType type{}; + C10_CUDA_CHECK(cudaGraphNodeGetType(n, &type)); + return type == cudaGraphNodeTypeEmpty; + }; + + // For each terminal node, perform a reverse DFS to count, for each empty + // node, how many terminals it can reach (i.e., for how many terminals it is + // a predecessor). An empty node is reusable if it is a predecessor of all + // terminal nodes. + ska::flat_hash_map num_terminals_reachable; + + for (auto terminal : terminals) { + ska::flat_hash_set visited; + ska::flat_hash_set empty_nodes; + + std::function reverse_dfs = + [&](cudaGraphNode_t node) { + if (!visited.insert(node).second) + return; + + if (is_empty_node(node)) { + num_terminals_reachable[node]++; + empty_nodes.insert(node); + } + auto parents = get_parents(node); + for (auto p : parents) { + reverse_dfs(p); + } + }; + + reverse_dfs(terminal); + } + + ska::flat_hash_set reusable_empty_nodes; + for (auto [node, count] : num_terminals_reachable) { + if (count == terminals.size()) { + reusable_empty_nodes.insert(node); + } + } + + return reusable_empty_nodes; + } + + // A block is considered reusable during CUDA graph capture if every free + // marker (empty node) associated with the block is a predecessor of every + // terminal node. + // + // This ensures that any new operation added to the graph will be attached + // after all terminal nodes, which themselves are after all free markers. As a + // result, all future work is guaranteed to occur after the block's last use + // on every stream, so the block's previous lifetime ends before any new + // lifetime begins. This check relies solely on the DAG topology and does not + // require event queries, making it safe to use during capture. + // + // This function iterates over all deferred blocks, determines if their empty + // nodes are reusable according to the above criteria, and frees the block if + // so. + void free_safe_blocks_in_capture( + const std::shared_ptr& context, + cudaStream_t stream) { + auto reusable_empty_nodes = get_reusable_empty_nodes(stream); + + // If there are no reusable empty nodes (e.g., not currently capturing), + // there is nothing to do. + if (reusable_empty_nodes.empty()) { + return; + } + + std::vector blocks_to_erase; + + for (auto& [block, inserted_empty_nodes] : deferred_blocks) { + // Skip this block if it has no empty nodes, as we defer its freeing until + // after graph capture. Also skip if the block was not allocated on the + // current stream; such blocks will be freed when + // free_safe_blocks_in_capture is attempted on that stream. 
+ if (inserted_empty_nodes.empty() || block->stream != stream) { + continue; + } + + bool is_reusable = true; + + for (const auto& node : inserted_empty_nodes) { + if (reusable_empty_nodes.find(node) == reusable_empty_nodes.end()) { + is_reusable = false; + break; + } + } + + if (is_reusable) { + // Clear stream uses since the graph ensures proper synchronization. + // No need to insert events. + block->stream_uses.clear(); + + free_block(block, context); + blocks_to_erase.push_back(block); + } + } + + // Remove blocks that were freed from the deferred_blocks map. + for (auto* block : blocks_to_erase) { + deferred_blocks.erase(block); + } + } + void free(Block* block) { std::shared_ptr context = maybeGatherContext(RecordContext::ALL); @@ -1654,14 +1906,22 @@ class DeviceCachingAllocator { if (block->size >= CUDAAllocatorConfig::max_split_size()) stats.oversize_allocations.decrease(1); + // If the block has been used on more than one stream, handle accordingly. if (!block->stream_uses.empty()) { if (C10_UNLIKELY(!captures_underway.empty())) { - // It's forbidden to cudaEventQuery an event recorded during CUDA graph - // capture. We conservatively defer recording end-of-life events until - // the next call to process_events() (which won't happen until no - // captures are underway) - needs_events_deferred_until_no_capture.push_back(block); + if (CUDAAllocatorConfig::graph_capture_record_stream_reuse()) { + // insert_free_marker returns a vector of free markers, + // or an empty vector if any associated stream is not currently + // capturing. The empty vector means that we will defer the free until + // capture is finished. + deferred_blocks.emplace(block, insert_free_marker(block)); + } else { + // If graph_capture_record_stream_reuse is not enabled, always defer + // the free until capture is finished. + deferred_blocks.emplace(block, std::vector{}); + } } else { + // If not in a capture, insert events for the block. 
insert_events(block); } } else { @@ -2977,8 +3237,8 @@ class DeviceCachingAllocator { --it; } if (!(*cur)->expandable_segment_) { - release_block(*cur, context); totalReleased += (*cur)->size; + release_block(*cur, context); } if (is_first) { break; @@ -3287,8 +3547,8 @@ class DeviceCachingAllocator { void insert_events_deferred_until_no_capture( const std::shared_ptr& context) { - if (C10_UNLIKELY(!needs_events_deferred_until_no_capture.empty())) { - for (auto* block : needs_events_deferred_until_no_capture) { + if (C10_UNLIKELY(!deferred_blocks.empty())) { + for (auto& [block, inserted_empty_nodes] : deferred_blocks) { TORCH_INTERNAL_ASSERT(!block->stream_uses.empty()); // only streams recorded before cudagraph will be used to insert events // since we know all streams recorded during cudagraph must have @@ -3300,7 +3560,7 @@ class DeviceCachingAllocator { free_block(block, context); } } - needs_events_deferred_until_no_capture.clear(); + deferred_blocks.clear(); } } @@ -3731,6 +3991,8 @@ class NativeCachingAllocator : public CUDAAllocator { md.pinned_use_host_register = CUDAAllocatorConfig::pinned_use_cuda_host_register(); md.last_allocator_settings = CUDAAllocatorConfig::last_allocator_settings(); + md.graph_capture_record_stream_reuse = + CUDAAllocatorConfig::graph_capture_record_stream_reuse(); md.roundup_power2_divisions = CUDAAllocatorConfig::roundup_power2_divisions(); diff --git a/c10/cuda/CUDACachingAllocator.h b/c10/cuda/CUDACachingAllocator.h index a89adb91e61d..bfc486d69fcf 100644 --- a/c10/cuda/CUDACachingAllocator.h +++ b/c10/cuda/CUDACachingAllocator.h @@ -163,6 +163,7 @@ struct AllocatorConfigInfo { bool expandable_segments; bool release_lock_on_malloc; bool pinned_use_host_register; + bool graph_capture_record_stream_reuse; std::string last_allocator_settings; std::vector roundup_power2_divisions; }; diff --git a/c10/cuda/CUDAFunctions.cpp b/c10/cuda/CUDAFunctions.cpp index 683ed9b76845..9839e4e72049 100644 --- a/c10/cuda/CUDAFunctions.cpp +++ b/c10/cuda/CUDAFunctions.cpp @@ -78,6 +78,18 @@ int device_count_impl(bool fail_if_no_driver) { "would like to use GPUs, turn off ASAN."); break; #endif // C10_ASAN_ENABLED +#if _WIN32 && CUDA_VERSION >= 13000 + // Workaround for CUDA-13.0 error handling on Windows, see + // https://github.com/pytorch/pytorch/issues/162333#issuecomment-3267929585 + case cudaErrorNotSupported: + if (!fail_if_no_driver) { + TORCH_WARN( + "cudaGetDeviceCount() returned cudaErrorNotSupported, " + "likely using older driver or on CPU machine"); + count = 0; + break; + } +#endif default: TORCH_CHECK( false, diff --git a/c10/cuda/driver_api.cpp b/c10/cuda/driver_api.cpp index f936b02ec9ab..d545bf5477b6 100644 --- a/c10/cuda/driver_api.cpp +++ b/c10/cuda/driver_api.cpp @@ -61,11 +61,14 @@ void* get_symbol(const char* name, int version) { } #endif + // As of CUDA 13, this API is deprecated. +#if defined(CUDA_VERSION) && (CUDA_VERSION < 13000) // This fallback to the old API to try getting the symbol again. if (auto st = cudaGetDriverEntryPoint(name, &out, cudaEnableDefault, &qres); st == cudaSuccess && qres == cudaDriverEntryPointSuccess && out) { return out; } +#endif // If the symbol cannot be resolved, report and return nullptr; // the caller is responsible for checking the pointer. 
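Stepping back from the allocator hunks above, a hedged end-to-end sketch of the scenario the new option targets: a block that picked up cross-stream uses during capture previously had to stay allocated until `capture_end()`, whereas with `graph_capture_record_stream_reuse:True` in `PYTORCH_CUDA_ALLOC_CONF` its free markers let it be recycled mid-capture. The snippet uses standard ATen/c10 APIs and omits the warm-up and synchronization a production capture would add:

```cpp
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAEvent.h>
#include <ATen/cuda/CUDAGraph.h>
#include <c10/cuda/CUDACachingAllocator.h>
#include <c10/cuda/CUDAGuard.h>
#include <c10/cuda/CUDAStream.h>

void capture_reuse_sketch() {
  auto main_stream = c10::cuda::getStreamFromPool();
  auto side_stream = c10::cuda::getStreamFromPool();
  c10::cuda::CUDAStreamGuard guard(main_stream);  // capture needs a non-default stream

  at::cuda::CUDAGraph graph;
  graph.capture_begin();
  {
    auto t = at::empty({1 << 20}, at::kCUDA);     // allocates block B
    at::cuda::CUDAEvent fork, join;
    fork.record(main_stream);
    {
      c10::cuda::CUDAStreamGuard side(side_stream);
      fork.block(side_stream);                    // side_stream joins the capture
      auto y = t * 2;                             // B is consumed on side_stream
      c10::cuda::CUDACachingAllocator::recordStream(
          t.storage().data_ptr(), side_stream);   // B now has stream_uses
      join.record(side_stream);
    }
    join.block(main_stream);
  }  // t destroyed mid-capture: free(B) runs while captures_underway is non-empty
  // Old behavior: B stays parked until capture_end(). New behavior: free()
  // records empty "free marker" nodes after B's last captured use on each
  // stream, and a later allocation in this same capture can reuse B once every
  // marker precedes every terminal node of the capture graph.
  auto z = at::empty({1 << 20}, at::kCUDA);
  graph.capture_end();
}
```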
diff --git a/c10/metal/igamma.h b/c10/metal/igamma.h new file mode 100644 index 000000000000..8dabdbbb621c --- /dev/null +++ b/c10/metal/igamma.h @@ -0,0 +1,744 @@ +#pragma once + +#include +#include +#include + +using namespace c10::metal; +using namespace metal; + +namespace c10 { +namespace metal { + +template +inline float log_gamma(const T); + +inline float expm1f(float a); + +template +float erfc(T x); + +} // namespace metal +} // namespace c10 + +namespace { + +template +inline float lgamma(const T a) { + return log_gamma(a); +} + +inline float expm1(float a) { + return expm1f(a); +} + +// NOTE: The following code was ported directly from the CUDA implementation in +// `aten/src/ATen/native/cuda/IGammaKernel.cu` + +/* + * This implementation of the regularized incomplete gamma functions and + * their helper functions are derived from the implementation of SciPy's + * gammainc, Cephes's igam and igamc, and Boost's Lanczos approximations. + * See NOTICE for the licenses. + */ +// regularized lower & upper incomplete gamma +template +scalar_t ratevl( + scalar_t x, + const scalar_t num[], + int64_t M, + const scalar_t denom[], + int64_t N) { + // evaluating rational function, i.e., the ratio of two polynomials + // the coefficients for numerator are given by `num` while coeffs for + // denumerator are given by `denom` + + using accscalar_t = opmath_t; + int64_t i, dir; + accscalar_t y, num_ans, denom_ans; + accscalar_t absx = ::fabs(x); + thread const accscalar_t* p; + + if (absx > 1) { + /* Evaluate as a polynomial in 1/x. */ + dir = -1; + p = num + M; + y = 1 / x; + } else { + dir = 1; + p = num; + y = x; + } + + /* Evaluate the numerator */ + num_ans = *p; + p += dir; + for (i = 1; i <= M; i++) { + num_ans = num_ans * y + *p; + p += dir; + } + /* Evaluate the denominator */ + if (absx > 1) { + p = denom + N; + } else { + p = denom; + } + + denom_ans = *p; + p += dir; + for (i = 1; i <= N; i++) { + denom_ans = denom_ans * y + *p; + p += dir; + } + if (absx > 1) { + i = N - M; + return ::pow(x, static_cast(i)) * num_ans / denom_ans; + } else { + return num_ans / denom_ans; + } +} + +template +scalar_t lanczos_sum_expg_scaled(scalar_t x) { + // lanczos approximation + using accscalar_t = opmath_t; + + const accscalar_t lanczos_sum_expg_scaled_num[13] = { + 0.006061842346248906525783753964555936883222, + 0.5098416655656676188125178644804694509993, + 19.51992788247617482847860966235652136208, + 449.9445569063168119446858607650988409623, + 6955.999602515376140356310115515198987526, + 75999.29304014542649875303443598909137092, + 601859.6171681098786670226533699352302507, + 3481712.15498064590882071018964774556468, + 14605578.08768506808414169982791359218571, + 43338889.32467613834773723740590533316085, + 86363131.28813859145546927288977868422342, + 103794043.1163445451906271053616070238554, + 56906521.91347156388090791033559122686859}; + const accscalar_t lanczos_sum_expg_scaled_denom[13] = { + 1., + 66., + 1925., + 32670., + 357423., + 2637558., + 13339535., + 45995730., + 105258076., + 150917976., + 120543840., + 39916800., + 0}; + return ratevl( + static_cast(x), + lanczos_sum_expg_scaled_num, + sizeof(lanczos_sum_expg_scaled_num) / + sizeof(lanczos_sum_expg_scaled_num[0]) - + 1, + lanczos_sum_expg_scaled_denom, + sizeof(lanczos_sum_expg_scaled_denom) / + sizeof(lanczos_sum_expg_scaled_denom[0]) - + 1); +} + +template +scalar_t _igam_helper_fac(scalar_t a, scalar_t x) { + // compute x^a * exp(-a) / gamma(a) + // corrected from (15) and (16) in [igam2] by replacing exp(x - a) with + // 
exp(a - x). + + using accscalar_t = opmath_t; + accscalar_t ax, fac, res, num, numfac; + const accscalar_t MAXLOG = 88.72283905206835; + const accscalar_t EXP1 = 2.718281828459045; + const accscalar_t lanczos_g = 6.024680040776729583740234375; + + if (::fabs(a - x) > 0.4 * ::fabs(a)) { + ax = a * ::log(x) - x - ::lgamma(a); + if (ax < -MAXLOG) { + return 0.0; + } + return ::exp(ax); + } + + fac = a + lanczos_g - 0.5; + res = ::sqrt(fac / EXP1) / lanczos_sum_expg_scaled(a); + + if ((a < 200) && (x < 200)) { + res *= ::exp(a - x) * ::pow(x / fac, a); + } else { + num = x - a - lanczos_g + 0.5; + numfac = num / fac; + res *= ::exp(a * (::log1p(numfac) - numfac) + x * (0.5 - lanczos_g) / fac); + } + return res; +} + +template +scalar_t _igam_helper_series(scalar_t a, scalar_t x) { + // Compute igam using DLMF 8.11.4. [igam1] + + using accscalar_t = opmath_t; + const accscalar_t MACHEP = 5.9604644775390625E-8; + const int MAXITER = 2000; + + int i; + accscalar_t ans, ax, c, r; + + ax = _igam_helper_fac(a, x); + if (ax == 0.0) { + return 0.0; + } + + /* power series */ + r = a; + c = 1.0; + ans = 1.0; + + for (i = 0; i < MAXITER; i++) { + r += 1.0; + c *= x / r; + ans += c; + if (c <= MACHEP * ans) { + break; + } + } + return (ans * ax / a); +} + +template +scalar_t _igamc_helper_series(scalar_t a, scalar_t x) { + // Compute igamc using DLMF 8.7.3 [igam1]. This is related to the series in + // _igam_helper_series but extra care is taken to avoid cancellation. + + using accscalar_t = opmath_t; + int n; + accscalar_t fac = 1; + accscalar_t sum = 0; + accscalar_t term, logx; + const int MAXITER = 2000; + const accscalar_t MACHEP = 5.9604644775390625E-8; + + for (n = 1; n < MAXITER; n++) { + fac *= -x / n; + term = fac / (a + n); + sum += term; + if (::fabs(term) <= MACHEP * ::fabs(sum)) { + break; + } + } + + logx = ::log(x); + term = -::expm1(a * logx - ::lgamma(1 + a)); + return term - ::exp(a * logx - ::lgamma(a)) * sum; +} + +template +scalar_t _igam_helper_asymptotic_series(scalar_t a, scalar_t x, bool igam) { + // Compute igam/igamc using DLMF 8.12.3/8.12.4 [igam1] + + using accscalar_t = opmath_t; + const accscalar_t d[25][25] = { + {-3.3333333333333333e-1, 8.3333333333333333e-2, + -1.4814814814814815e-2, 1.1574074074074074e-3, + 3.527336860670194e-4, -1.7875514403292181e-4, + 3.9192631785224378e-5, -2.1854485106799922e-6, + -1.85406221071516e-6, 8.296711340953086e-7, + -1.7665952736826079e-7, 6.7078535434014986e-9, + 1.0261809784240308e-8, -4.3820360184533532e-9, + 9.1476995822367902e-10, -2.551419399494625e-11, + -5.8307721325504251e-11, 2.4361948020667416e-11, + -5.0276692801141756e-12, 1.1004392031956135e-13, + 3.3717632624009854e-13, -1.3923887224181621e-13, + 2.8534893807047443e-14, -5.1391118342425726e-16, + -1.9752288294349443e-15}, + {-1.8518518518518519e-3, -3.4722222222222222e-3, 2.6455026455026455e-3, + -9.9022633744855967e-4, 2.0576131687242798e-4, -4.0187757201646091e-7, + -1.8098550334489978e-5, 7.6491609160811101e-6, -1.6120900894563446e-6, + 4.6471278028074343e-9, 1.378633446915721e-7, -5.752545603517705e-8, + 1.1951628599778147e-8, -1.7543241719747648e-11, -1.0091543710600413e-9, + 4.1627929918425826e-10, -8.5639070264929806e-11, 6.0672151016047586e-14, + 7.1624989648114854e-12, -2.9331866437714371e-12, 5.9966963656836887e-13, + -2.1671786527323314e-16, -4.9783399723692616e-14, 2.0291628823713425e-14, + -4.13125571381061e-15}, + {4.1335978835978836e-3, -2.6813271604938272e-3, 7.7160493827160494e-4, + 2.0093878600823045e-6, -1.0736653226365161e-4, 5.2923448829120125e-5, + 
-1.2760635188618728e-5, 3.4235787340961381e-8, 1.3721957309062933e-6, + -6.298992138380055e-7, 1.4280614206064242e-7, -2.0477098421990866e-10, + -1.4092529910867521e-8, 6.228974084922022e-9, -1.3670488396617113e-9, + 9.4283561590146782e-13, 1.2872252400089318e-10, -5.5645956134363321e-11, + 1.1975935546366981e-11, -4.1689782251838635e-15, -1.0940640427884594e-12, + 4.6622399463901357e-13, -9.905105763906906e-14, 1.8931876768373515e-17, + 8.8592218725911273e-15}, + {6.4943415637860082e-4, 2.2947209362139918e-4, -4.6918949439525571e-4, + 2.6772063206283885e-4, -7.5618016718839764e-5, -2.3965051138672967e-7, + 1.1082654115347302e-5, -5.6749528269915966e-6, 1.4230900732435884e-6, + -2.7861080291528142e-11, -1.6958404091930277e-7, 8.0994649053880824e-8, + -1.9111168485973654e-8, 2.3928620439808118e-12, 2.0620131815488798e-9, + -9.4604966618551322e-10, 2.1541049775774908e-10, -1.388823336813903e-14, + -2.1894761681963939e-11, 9.7909989511716851e-12, -2.1782191880180962e-12, + 6.2088195734079014e-17, 2.126978363279737e-13, -9.3446887915174333e-14, + 2.0453671226782849e-14}, + {-8.618882909167117e-4, 7.8403922172006663e-4, + -2.9907248030319018e-4, -1.4638452578843418e-6, + 6.6414982154651222e-5, -3.9683650471794347e-5, + 1.1375726970678419e-5, 2.5074972262375328e-10, + -1.6954149536558306e-6, 8.9075075322053097e-7, + -2.2929348340008049e-7, 2.956794137544049e-11, + 2.8865829742708784e-8, -1.4189739437803219e-8, + 3.4463580499464897e-9, -2.3024517174528067e-13, + -3.9409233028046405e-10, 1.8602338968504502e-10, + -4.356323005056618e-11, 1.2786001016296231e-15, + 4.6792750266579195e-12, -2.1492464706134829e-12, + 4.9088156148096522e-13, -6.3385914848915603e-18, + -5.0453320690800944e-14}, + {-3.3679855336635815e-4, -6.9728137583658578e-5, 2.7727532449593921e-4, + -1.9932570516188848e-4, 6.7977804779372078e-5, 1.419062920643967e-7, + -1.3594048189768693e-5, 8.0184702563342015e-6, -2.2914811765080952e-6, + -3.252473551298454e-10, 3.4652846491085265e-7, -1.8447187191171343e-7, + 4.8240967037894181e-8, -1.7989466721743515e-14, -6.3061945000135234e-9, + 3.1624176287745679e-9, -7.8409242536974293e-10, 5.1926791652540407e-15, + 9.3589442423067836e-11, -4.5134262161632782e-11, 1.0799129993116827e-11, + -3.661886712685252e-17, -1.210902069055155e-12, 5.6807435849905643e-13, + -1.3249659916340829e-13}, + {5.3130793646399222e-4, -5.9216643735369388e-4, 2.7087820967180448e-4, + 7.9023532326603279e-7, -8.1539693675619688e-5, 5.6116827531062497e-5, + -1.8329116582843376e-5, -3.0796134506033048e-9, 3.4651553688036091e-6, + -2.0291327396058604e-6, 5.7887928631490037e-7, 2.338630673826657e-13, + -8.8286007463304835e-8, 4.7435958880408128e-8, -1.2545415020710382e-8, + 8.6496488580102925e-14, 1.6846058979264063e-9, -8.5754928235775947e-10, + 2.1598224929232125e-10, -7.6132305204761539e-16, -2.6639822008536144e-11, + 1.3065700536611057e-11, -3.1799163902367977e-12, 4.7109761213674315e-18, + 3.6902800842763467e-13}, + {3.4436760689237767e-4, 5.1717909082605922e-5, + -3.3493161081142236e-4, 2.812695154763237e-4, + -1.0976582244684731e-4, -1.2741009095484485e-7, + 2.7744451511563644e-5, -1.8263488805711333e-5, + 5.7876949497350524e-6, 4.9387589339362704e-10, + -1.0595367014026043e-6, 6.1667143761104075e-7, + -1.7562973359060462e-7, -1.2974473287015439e-12, + 2.695423606288966e-8, -1.4578352908731271e-8, + 3.887645959386175e-9, -3.8810022510194121e-17, + -5.3279941738772867e-10, 2.7437977643314845e-10, + -6.9957960920705679e-11, 2.5899863874868481e-17, + 8.8566890996696381e-12, -4.403168815871311e-12, + 
1.0865561947091654e-12}, + {-6.5262391859530942e-4, 8.3949872067208728e-4, -4.3829709854172101e-4, + -6.969091458420552e-7, 1.6644846642067548e-4, -1.2783517679769219e-4, + 4.6299532636913043e-5, 4.5579098679227077e-9, -1.0595271125805195e-5, + 6.7833429048651666e-6, -2.1075476666258804e-6, -1.7213731432817145e-11, + 3.7735877416110979e-7, -2.1867506700122867e-7, 6.2202288040189269e-8, + 6.5977038267330006e-16, -9.5903864974256858e-9, 5.2132144922808078e-9, + -1.3991589583935709e-9, 5.382058999060575e-16, 1.9484714275467745e-10, + -1.0127287556389682e-10, 2.6077347197254926e-11, -5.0904186999932993e-18, + -3.3721464474854592e-12}, + {-5.9676129019274625e-4, -7.2048954160200106e-5, + 6.7823088376673284e-4, -6.4014752602627585e-4, + 2.7750107634328704e-4, 1.8197008380465151e-7, + -8.4795071170685032e-5, 6.105192082501531e-5, + -2.1073920183404862e-5, -8.8585890141255994e-10, + 4.5284535953805377e-6, -2.8427815022504408e-6, + 8.7082341778646412e-7, 3.6886101871706965e-12, + -1.5344695190702061e-7, 8.862466778790695e-8, + -2.5184812301826817e-8, -1.0225912098215092e-14, + 3.8969470758154777e-9, -2.1267304792235635e-9, + 5.7370135528051385e-10, -1.887749850169741e-19, + -8.0931538694657866e-11, 4.2382723283449199e-11, + -1.1002224534207726e-11}, + {1.3324454494800656e-3, -1.9144384985654775e-3, 1.1089369134596637e-3, + 9.932404122642299e-7, -5.0874501293093199e-4, 4.2735056665392884e-4, + -1.6858853767910799e-4, -8.1301893922784998e-9, 4.5284402370562147e-5, + -3.127053674781734e-5, 1.044986828530338e-5, 4.8435226265680926e-11, + -2.1482565873456258e-6, 1.329369701097492e-6, -4.0295693092101029e-7, + -1.7567877666323291e-13, 7.0145043163668257e-8, -4.040787734999483e-8, + 1.1474026743371963e-8, 3.9642746853563325e-18, -1.7804938269892714e-9, + 9.7480262548731646e-10, -2.6405338676507616e-10, 5.794875163403742e-18, + 3.7647749553543836e-11}, + {1.579727660730835e-3, 1.6251626278391582e-4, -2.0633421035543276e-3, + 2.1389686185689098e-3, -1.0108559391263003e-3, -3.9912705529919201e-7, + 3.6235025084764691e-4, -2.8143901463712154e-4, 1.0449513336495887e-4, + 2.1211418491830297e-9, -2.5779417251947842e-5, 1.7281818956040463e-5, + -5.6413773872904282e-6, -1.1024320105776174e-11, 1.1223224418895175e-6, + -6.8693396379526735e-7, 2.0653236975414887e-7, 4.6714772409838506e-14, + -3.5609886164949055e-8, 2.0470855345905963e-8, -5.8091738633283358e-9, + -1.332821287582869e-16, 9.0354604391335133e-10, -4.9598782517330834e-10, + 1.3481607129399749e-10}, + {-4.0725121195140166e-3, 6.4033628338080698e-3, -4.0410161081676618e-3, + -2.183732802866233e-6, 2.1740441801254639e-3, -1.9700440518418892e-3, + 8.3595469747962458e-4, 1.9445447567109655e-8, -2.5779387120421696e-4, + 1.9009987368139304e-4, -6.7696499937438965e-5, -1.4440629666426572e-10, + 1.5712512518742269e-5, -1.0304008744776893e-5, 3.304517767401387e-6, + 7.9829760242325709e-13, -6.4097794149313004e-7, 3.8894624761300056e-7, + -1.1618347644948869e-7, -2.816808630596451e-15, 1.9878012911297093e-8, + -1.1407719956357511e-8, 3.2355857064185555e-9, 4.1759468293455945e-20, + -5.0423112718105824e-10}, + {-5.9475779383993003e-3, -5.4016476789260452e-4, 8.7910413550767898e-3, + -9.8576315587856125e-3, 5.0134695031021538e-3, 1.2807521786221875e-6, + -2.0626019342754683e-3, 1.7109128573523058e-3, -6.7695312714133799e-4, + -6.9011545676562133e-9, 1.8855128143995902e-4, -1.3395215663491969e-4, + 4.6263183033528039e-5, 4.0034230613321351e-11, -1.0255652921494033e-5, + 6.612086372797651e-6, -2.0913022027253008e-6, -2.0951775649603837e-13, + 3.9756029041993247e-7, 
-2.3956211978815887e-7, 7.1182883382145864e-8, + 8.925574873053455e-16, -1.2101547235064676e-8, 6.9350618248334386e-9, + -1.9661464453856102e-9}, + {1.7402027787522711e-2, -2.9527880945699121e-2, 2.0045875571402799e-2, + 7.0289515966903407e-6, -1.2375421071343148e-2, 1.1976293444235254e-2, + -5.4156038466518525e-3, -6.3290893396418616e-8, 1.8855118129005065e-3, + -1.473473274825001e-3, 5.5515810097708387e-4, 5.2406834412550662e-10, + -1.4357913535784836e-4, 9.9181293224943297e-5, -3.3460834749478311e-5, + -3.5755837291098993e-12, 7.1560851960630076e-6, -4.5516802628155526e-6, + 1.4236576649271475e-6, 1.8803149082089664e-14, -2.6623403898929211e-7, + 1.5950642189595716e-7, -4.7187514673841102e-8, -6.5107872958755177e-17, + 7.9795091026746235e-9}, + {3.0249124160905891e-2, 2.4817436002649977e-3, -4.9939134373457022e-2, + 5.9915643009307869e-2, -3.2483207601623391e-2, -5.7212968652103441e-6, + 1.5085251778569354e-2, -1.3261324005088445e-2, 5.5515262632426148e-3, + 3.0263182257030016e-8, -1.7229548406756723e-3, 1.2893570099929637e-3, + -4.6845138348319876e-4, -1.830259937893045e-10, 1.1449739014822654e-4, + -7.7378565221244477e-5, 2.5625836246985201e-5, 1.0766165333192814e-12, + -5.3246809282422621e-6, 3.349634863064464e-6, -1.0381253128684018e-6, + -5.608909920621128e-15, 1.9150821930676591e-7, -1.1418365800203486e-7, + 3.3654425209171788e-8}, + {-9.9051020880159045e-2, 1.7954011706123486e-1, -1.2989606383463778e-1, + -3.1478872752284357e-5, 9.0510635276848131e-2, -9.2828824411184397e-2, + 4.4412112839877808e-2, 2.7779236316835888e-7, -1.7229543805449697e-2, + 1.4182925050891573e-2, -5.6214161633747336e-3, -2.39598509186381e-9, + 1.6029634366079908e-3, -1.1606784674435773e-3, 4.1001337768153873e-4, + 1.8365800754090661e-11, -9.5844256563655903e-5, 6.3643062337764708e-5, + -2.076250624489065e-5, -1.1806020912804483e-13, 4.2131808239120649e-6, + -2.6262241337012467e-6, 8.0770620494930662e-7, 6.0125912123632725e-16, + -1.4729737374018841e-7}, + {-1.9994542198219728e-1, -1.5056113040026424e-2, 3.6470239469348489e-1, + -4.6435192311733545e-1, 2.6640934719197893e-1, 3.4038266027147191e-5, + -1.3784338709329624e-1, 1.276467178337056e-1, -5.6213828755200985e-2, + -1.753150885483011e-7, 1.9235592956768113e-2, -1.5088821281095315e-2, + 5.7401854451350123e-3, 1.0622382710310225e-9, -1.5335082692563998e-3, + 1.0819320643228214e-3, -3.7372510193945659e-4, -6.6170909729031985e-12, + 8.4263617380909628e-5, -5.5150706827483479e-5, 1.7769536448348069e-5, + 3.8827923210205533e-14, -3.53513697488768e-6, 2.1865832130045269e-6, + -6.6812849447625594e-7}, + {7.2438608504029431e-1, -1.3918010932653375, 1.0654143352413968, + 1.876173868950258e-4, -8.2705501176152696e-1, 8.9352433347828414e-1, + -4.4971003995291339e-1, -1.6107401567546652e-6, 1.9235590165271091e-1, + -1.6597702160042609e-1, 6.8882222681814333e-2, 1.3910091724608687e-8, + -2.146911561508663e-2, 1.6228980898865892e-2, -5.9796016172584256e-3, + -1.1287469112826745e-10, 1.5167451119784857e-3, -1.0478634293553899e-3, + 3.5539072889126421e-4, 8.1704322111801517e-13, -7.7773013442452395e-5, + 5.0291413897007722e-5, -1.6035083867000518e-5, 1.2469354315487605e-14, + 3.1369106244517615e-6}, + {1.6668949727276811, 1.165462765994632e-1, -3.3288393225018906, + 4.4692325482864037, -2.6977693045875807, -2.600667859891061e-4, + 1.5389017615694539, -1.4937962361134612, 6.8881964633233148e-1, + 1.3077482004552385e-6, -2.5762963325596288e-1, 2.1097676102125449e-1, + -8.3714408359219882e-2, -7.7920428881354753e-9, 2.4267923064833599e-2, + -1.7813678334552311e-2, 
6.3970330388900056e-3, 4.9430807090480523e-11, + -1.5554602758465635e-3, 1.0561196919903214e-3, -3.5277184460472902e-4, + 9.3002334645022459e-14, 7.5285855026557172e-5, -4.8186515569156351e-5, + 1.5227271505597605e-5}, + {-6.6188298861372935, 1.3397985455142589e+1, -1.0789350606845146e+1, + -1.4352254537875018e-3, 9.2333694596189809, -1.0456552819547769e+1, + 5.5105526029033471, 1.2024439690716742e-5, -2.5762961164755816, + 2.3207442745387179, -1.0045728797216284, -1.0207833290021914e-7, + 3.3975092171169466e-1, -2.6720517450757468e-1, 1.0235252851562706e-1, + 8.4329730484871625e-10, -2.7998284958442595e-2, 2.0066274144976813e-2, + -7.0554368915086242e-3, 1.9402238183698188e-12, 1.6562888105449611e-3, + -1.1082898580743683e-3, 3.654545161310169e-4, -5.1290032026971794e-11, + -7.6340103696869031e-5}, + {-1.7112706061976095e+1, -1.1208044642899116, 3.7131966511885444e+1, + -5.2298271025348962e+1, 3.3058589696624618e+1, 2.4791298976200222e-3, + -2.061089403411526e+1, 2.088672775145582e+1, -1.0045703956517752e+1, + -1.2238783449063012e-5, 4.0770134274221141, -3.473667358470195, + 1.4329352617312006, 7.1359914411879712e-8, -4.4797257159115612e-1, + 3.4112666080644461e-1, -1.2699786326594923e-1, -2.8953677269081528e-10, + 3.3125776278259863e-2, -2.3274087021036101e-2, 8.0399993503648882e-3, + -1.177805216235265e-9, -1.8321624891071668e-3, 1.2108282933588665e-3, + -3.9479941246822517e-4}, + {7.389033153567425e+1, -1.5680141270402273e+2, 1.322177542759164e+2, + 1.3692876877324546e-2, -1.2366496885920151e+2, 1.4620689391062729e+2, + -8.0365587724865346e+1, -1.1259851148881298e-4, 4.0770132196179938e+1, + -3.8210340013273034e+1, 1.719522294277362e+1, 9.3519707955168356e-7, + -6.2716159907747034, 5.1168999071852637, -2.0319658112299095, + -4.9507215582761543e-9, 5.9626397294332597e-1, -4.4220765337238094e-1, + 1.6079998700166273e-1, -2.4733786203223402e-8, -4.0307574759979762e-2, + 2.7849050747097869e-2, -9.4751858992054221e-3, 6.419922235909132e-6, + 2.1250180774699461e-3}, + {2.1216837098382522e+2, 1.3107863022633868e+1, -4.9698285932871748e+2, + 7.3121595266969204e+2, -4.8213821720890847e+2, -2.8817248692894889e-2, + 3.2616720302947102e+2, -3.4389340280087117e+2, 1.7195193870816232e+2, + 1.4038077378096158e-4, -7.52594195897599e+1, 6.651969984520934e+1, + -2.8447519748152462e+1, -7.613702615875391e-7, 9.5402237105304373, + -7.5175301113311376, 2.8943997568871961, -4.6612194999538201e-7, + -8.0615149598794088e-1, 5.8483006570631029e-1, -2.0845408972964956e-1, + 1.4765818959305817e-4, 5.1000433863753019e-2, -3.3066252141883665e-2, + 1.5109265210467774e-2}, + {-9.8959643098322368e+2, 2.1925555360905233e+3, -1.9283586782723356e+3, + -1.5925738122215253e-1, 1.9569985945919857e+3, -2.4072514765081556e+3, + 1.3756149959336496e+3, 1.2920735237496668e-3, -7.525941715948055e+2, + 7.3171668742208716e+2, -3.4137023466220065e+2, -9.9857390260608043e-6, + 1.3356313181291573e+2, -1.1276295161252794e+2, 4.6310396098204458e+1, + -7.9237387133614756e-6, -1.4510726927018646e+1, 1.1111771248100563e+1, + -4.1690817945270892, 3.1008219800117808e-3, 1.1220095449981468, + -7.6052379926149916e-1, 3.6262236505085254e-1, 2.216867741940747e-1, + 4.8683443692930507e-1}}; + + int k, n, sgn; + int maxpow = 0; + const accscalar_t MACHEP = 5.9604644775390625E-8; + accscalar_t lambda = x / a; + accscalar_t sigma = (x - a) / a; + accscalar_t eta, res, ck, ckterm, term, absterm; + accscalar_t absoldterm = INFINITY; + accscalar_t etapow[25] = {1}; + accscalar_t sum = 0; + accscalar_t afac = 1; + + if (igam) { + sgn = -1; + } else { + 
sgn = 1; + } + + if (lambda > 1) { + eta = ::sqrt(-2 * (::log1p(sigma) - sigma)); + } else if (lambda < 1) { + eta = -::sqrt(-2 * (::log1p(sigma) - sigma)); + } else { + eta = 0; + } + res = 0.5 * ::erfc(sgn * eta * ::sqrt(a / 2)); + + for (k = 0; k < 25; k++) { + ck = d[k][0]; + for (n = 1; n < 25; n++) { + if (n > maxpow) { + etapow[n] = eta * etapow[n - 1]; + maxpow += 1; + } + ckterm = d[k][n] * etapow[n]; + ck += ckterm; + if (::fabs(ckterm) < MACHEP * ::fabs(ck)) { + break; + } + } + term = ck * afac; + absterm = ::fabs(term); + if (absterm > absoldterm) { + break; + } + sum += term; + if (absterm < MACHEP * ::fabs(sum)) { + break; + } + absoldterm = absterm; + afac /= a; + } + res += sgn * ::exp(-0.5 * a * eta * eta) * sum / ::sqrt(2 * 3.1415926535 * a); + + return res; +} + +template +scalar_t _igamc_helper_continued_fraction(scalar_t a, scalar_t x) { + // Compute igamc using DLMF 8.9.2. [igam1] + + using accscalar_t = opmath_t; + int i; + accscalar_t ans, ax, c, yc, r, t, y, z; + accscalar_t pk, pkm1, pkm2, qk, qkm1, qkm2; + const int MAXITER = 2000; + const accscalar_t MACHEP = 5.9604644775390625E-8; + const accscalar_t BIG = 16777216.; + const accscalar_t BIGINV = 5.9604644775390625E-8; + + ax = _igam_helper_fac(a, x); + if (ax == 0.0) { + return 0.0; + } + + /* continued fraction */ + y = 1.0 - a; + z = x + y + 1.0; + c = 0.0; + pkm2 = 1.0; + qkm2 = x; + pkm1 = x + 1.0; + qkm1 = z * x; + ans = pkm1 / qkm1; + + for (i = 0; i < MAXITER; i++) { + c += 1.0; + y += 1.0; + z += 2.0; + yc = y * c; + pk = pkm1 * z - pkm2 * yc; + qk = qkm1 * z - qkm2 * yc; + if (qk != 0) { + r = pk / qk; + t = ::fabs((ans - r) / r); + ans = r; + } else { + t = 1.0; + } + pkm2 = pkm1; + pkm1 = pk; + qkm2 = qkm1; + qkm1 = qk; + if (::fabs(pk) > BIG) { + pkm2 *= BIGINV; + pkm1 *= BIGINV; + qkm2 *= BIGINV; + qkm1 *= BIGINV; + } + if (t <= MACHEP) { + break; + } + } + return ans * ax; +} + +template +scalar_t calc_igammac(scalar_t a, scalar_t x) { + /* the calculation of the regularized upper incomplete gamma function + * is done differently based on the values of a and x: + * - if x and/or a is at the boundary of defined region, then assign the + * result at the boundary + * - if a is large and a ~ x, then using Uniform Asymptotic Expansions for + * Large Parameter (see DLMF 8.12.4 [igam1]) + * - if x > 1.1 and x < a, using the subtraction from the regularized lower + * incomplete gamma + * - otherwise, calculate the series from [igam2] eq (5) + */ + + using accscalar_t = opmath_t; + accscalar_t absxma_a; + + const accscalar_t SMALL = 20.0; + const accscalar_t LARGE = 200.0; + const accscalar_t SMALLRATIO = 0.3; + const accscalar_t LARGERATIO = 4.5; + + if ((x < 0) || (a < 0)) { + // out of defined-region of the function + return NAN; + } else if (a == 0) { + if (x > 0) { + return 0.0; + } else { + return NAN; + } + } else if (x == 0) { + return 1.0; + } else if (isinf(a)) { + if (isinf(x)) { + return NAN; + } + return 1.0; + } else if (isinf(x)) { + return 0.0; + } + + absxma_a = ::fabs(x - a) / a; + if ((a > SMALL) && (a < LARGE) && (absxma_a < SMALLRATIO)) { + return _igam_helper_asymptotic_series(a, x, 0); + } else if ((a > LARGE) && (absxma_a < LARGERATIO / ::sqrt(a))) { + return _igam_helper_asymptotic_series(a, x, 0); + } + + if (x > 1.1) { + if (x < a) { + return 1.0 - _igam_helper_series(a, x); + } else { + return _igamc_helper_continued_fraction(a, x); + } + } else if (x <= 0.5) { + if (-0.4 / ::log(x) < a) { + return 1.0 - _igam_helper_series(a, x); + } else { + return _igamc_helper_series(a, x); 
+template <typename scalar_t>
+scalar_t calc_igammac(scalar_t a, scalar_t x) {
+  /* The calculation of the regularized upper incomplete gamma function
+   * is done differently based on the values of a and x:
+   * - if x and/or a is at the boundary of the defined region, then assign
+   *   the result at the boundary
+   * - if a is large and a ~ x, then use the Uniform Asymptotic Expansion for
+   *   Large Parameter (see DLMF 8.12.4 [igam1])
+   * - if x > 1.1 and x < a, use the subtraction from the regularized lower
+   *   incomplete gamma
+   * - otherwise, calculate the series from [igam2] eq (5)
+   */
+
+  using accscalar_t = opmath_t<scalar_t>;
+  accscalar_t absxma_a;
+
+  const accscalar_t SMALL = 20.0;
+  const accscalar_t LARGE = 200.0;
+  const accscalar_t SMALLRATIO = 0.3;
+  const accscalar_t LARGERATIO = 4.5;
+
+  if ((x < 0) || (a < 0)) {
+    // out of the defined region of the function
+    return NAN;
+  } else if (a == 0) {
+    if (x > 0) {
+      return 0.0;
+    } else {
+      return NAN;
+    }
+  } else if (x == 0) {
+    return 1.0;
+  } else if (isinf(a)) {
+    if (isinf(x)) {
+      return NAN;
+    }
+    return 1.0;
+  } else if (isinf(x)) {
+    return 0.0;
+  }
+
+  absxma_a = ::fabs(x - a) / a;
+  if ((a > SMALL) && (a < LARGE) && (absxma_a < SMALLRATIO)) {
+    return _igam_helper_asymptotic_series(a, x, 0);
+  } else if ((a > LARGE) && (absxma_a < LARGERATIO / ::sqrt(a))) {
+    return _igam_helper_asymptotic_series(a, x, 0);
+  }
+
+  if (x > 1.1) {
+    if (x < a) {
+      return 1.0 - _igam_helper_series(a, x);
+    } else {
+      return _igamc_helper_continued_fraction(a, x);
+    }
+  } else if (x <= 0.5) {
+    if (-0.4 / ::log(x) < a) {
+      return 1.0 - _igam_helper_series(a, x);
+    } else {
+      return _igamc_helper_series(a, x);
+    }
+  } else {
+    if (x * 1.1 < a) {
+      return 1.0 - _igam_helper_series(a, x);
+    } else {
+      return _igamc_helper_series(a, x);
+    }
+  }
+}
+
+template <typename scalar_t>
+scalar_t calc_igamma(scalar_t a, scalar_t x) {
+  /* The calculation of the regularized lower incomplete gamma function
+   * is done differently based on the values of a and x:
+   * - if x and/or a is at the boundary of the defined region, then assign
+   *   the result at the boundary
+   * - if a is large and a ~ x, then use the Uniform Asymptotic Expansion for
+   *   Large Parameter (see DLMF 8.12.3 [igam1])
+   * - if x > 1 and x > a, use the subtraction from the regularized upper
+   *   incomplete gamma
+   * - otherwise, calculate the series from [igam2] eq (4)
+   */
+
+  using accscalar_t = opmath_t<scalar_t>;
+  accscalar_t absxma_a;
+  const accscalar_t SMALL = 20.0;
+  const accscalar_t LARGE = 200.0;
+  const accscalar_t SMALLRATIO = 0.3;
+  const accscalar_t LARGERATIO = 4.5;
+
+  // boundary values following SciPy
+  if ((x < 0) || (a < 0)) {
+    // out of the defined region of the function
+    return NAN;
+  } else if (a == 0) {
+    if (x > 0) {
+      return 1.0;
+    } else {
+      return NAN;
+    }
+  } else if (x == 0) {
+    return 0.0; // zero integration limit
+  } else if (isinf(a)) {
+    if (isinf(x)) {
+      return NAN;
+    }
+    return 0.0;
+  } else if (isinf(x)) {
+    return 1.0;
+  }
+
+  /* Asymptotic regime where a ~ x. */
+  absxma_a = ::fabs(x - a) / a;
+  if ((a > SMALL) && (a < LARGE) && (absxma_a < SMALLRATIO)) {
+    return _igam_helper_asymptotic_series(a, x, 1);
+  } else if ((a > LARGE) && (absxma_a < LARGERATIO / ::sqrt(a))) {
+    return _igam_helper_asymptotic_series(a, x, 1);
+  }
+
+  if ((x > 1.0) && (x > a)) {
+    return 1.0 - calc_igammac(a, x);
+  }
+
+  return _igam_helper_series(a, x);
+}
+
+} // namespace
+
+// end of regularized lower & upper incomplete gamma
+
+namespace c10 {
+namespace metal {
+
+template <typename T>
+inline T igamma(T a, T b) {
+  return calc_igamma(a, b);
+}
+
+template <typename T>
+inline T igammac(T a, T b) {
+  return calc_igammac(a, b);
+}
+
+} // namespace metal
+} // namespace c10
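A quick host-side way to sanity-check the two kernels above, independent of Metal: the regularized lower and upper incomplete gamma functions are complements, so calc_igamma(a, x) + calc_igammac(a, x) should equal 1 (up to rounding) for a, x > 0, and P(a, x) itself has a simple power series. The sketch below is an editor's illustration, not part of the patch; the helper name reference_igamma is made up here. It implements the series P(a, x) = x^a e^{-x} / Gamma(a) * sum_{n>=0} x^n / (a (a+1) ... (a+n)) in double precision and checks it against the closed form P(1, x) = 1 - exp(-x).

#include <cassert>
#include <cmath>
#include <cstdio>

// Double-precision reference for the regularized lower incomplete gamma.
double reference_igamma(double a, double x) {
  double term = 1.0 / a; // n = 0 term of the series
  double sum = term;
  for (int n = 1; n < 1000; ++n) {
    term *= x / (a + n);
    sum += term;
    if (term < 1e-17 * sum) {
      break; // converged to double precision
    }
  }
  // Multiply by x^a e^{-x} / Gamma(a), computed in log space to avoid overflow.
  return sum * std::exp(-x + a * std::log(x) - std::lgamma(a));
}

int main() {
  // Closed form: P(1, x) = 1 - exp(-x).
  double p = reference_igamma(1.0, 2.0);
  assert(std::fabs(p - (1.0 - std::exp(-2.0))) < 1e-12);
  std::printf("P(1, 2) = %.12f\n", p);
  return 0;
}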
diff --git a/c10/metal/special_math.h b/c10/metal/special_math.h
index 34f6ab6d1d09..29a45ff4c30b 100644
--- a/c10/metal/special_math.h
+++ b/c10/metal/special_math.h
@@ -1,6 +1,7 @@
 // Implementation of specal math functions for Metal
 #pragma once
 #include
+#include
 #include
 #include
@@ -47,6 +48,11 @@ inline float erf(T x) {
   return r;
 }
 
+template <typename T>
+float erfc(T x) {
+  return 1.0 - erf(x);
+}
+
 template <typename T>
 inline float erfinv(T y) {
   /* coefficients in rational expansion */
diff --git a/c10/test/build.bzl b/c10/test/build.bzl
index 2f54c8a2faa5..deb917dd8fcf 100644
--- a/c10/test/build.bzl
+++ b/c10/test/build.bzl
@@ -46,7 +46,7 @@ def define_targets(rules):
             "util/typeid_test.cpp",
         ],
     ),
-    copts = ["-Wno-deprecated-declarations"],
+    copts = ["-Wno-deprecated-declarations", "-Wno-ctad-maybe-unsupported"],
     deps = [
         ":Macros",
         ":complex_math_test_common",
diff --git a/c10/test/core/SymInt_test.cpp b/c10/test/core/SymInt_test.cpp
index 7cefa1e4a771..e408543f5362 100644
--- a/c10/test/core/SymInt_test.cpp
+++ b/c10/test/core/SymInt_test.cpp
@@ -1,5 +1,6 @@
 #include
+#include
 #include
 #include
 #include
@@ -35,4 +36,169 @@ TEST(SymIntTest, Overflows) {
 }
 #endif
 
+namespace {
+
+// We need a SymNodeImpl that 1) has working arithmetic with
+// predictable results and 2) causes SymInt::maybe_as_int to return
+// nullopt so that we can hit all 4 cases (zero/one/both arguments
+// have null maybe_as_int) in the operator implementations.
+class ConstantIntPretendingToBeSymbolicSymNodeImpl
+    : public ConstantSymNodeImpl<int64_t> {
+ public:
+  using ConstantSymNodeImpl::ConstantSymNodeImpl;
+  std::optional<int64_t> constant_int() override {
+    return std::nullopt;
+  }
+  std::optional<int64_t> maybe_as_int() override {
+    return std::nullopt;
+  }
+  // Needs to be implemented for arithmetic to actually
+  // work. NestedIntSymNodeImpl does this, for example.
+  c10::SymNode wrap_int(int64_t num) override {
+    return SymNode(
+        c10::make_intrusive<ConstantIntPretendingToBeSymbolicSymNodeImpl>(num));
+  }
+
+  c10::SymNode wrap_bool(bool b) override {
+    return SymNode(c10::make_intrusive<ConstantSymNodeImpl<bool>>(b));
+  }
+
+  SymNode add(const SymNode& other) override {
+    return wrap_int(int_() + other->int_());
+  }
+
+  SymNode sub(const SymNode& other) override {
+    return wrap_int(int_() - other->int_());
+  }
+
+  SymNode mul(const SymNode& other) override {
+    return wrap_int(int_() * other->int_());
+  }
+
+  SymNode floordiv(const SymNode& other) override {
+    return wrap_int(int_() / other->int_());
+  }
+
+  SymNode sym_min(const SymNode& other) override {
+    return wrap_int(std::min(int_(), other->int_()));
+  }
+
+  SymNode sym_max(const SymNode& other) override {
+    return wrap_int(std::max(int_(), other->int_()));
+  }
+
+  SymNode mod(const SymNode& other) override {
+    return wrap_int(int_() % other->int_());
+  }
+
+  SymNode eq(const SymNode& other) override {
+    return wrap_bool(int_() == other->int_());
+  }
+
+  SymNode ne(const SymNode& other) override {
+    return wrap_bool(int_() != other->int_());
+  }
+
+  SymNode lt(const SymNode& other) override {
+    return wrap_bool(int_() < other->int_());
+  }
+
+  SymNode le(const SymNode& other) override {
+    return wrap_bool(int_() <= other->int_());
+  }
+
+  SymNode gt(const SymNode& other) override {
+    return wrap_bool(int_() > other->int_());
+  }
+
+  SymNode ge(const SymNode& other) override {
+    return wrap_bool(int_() >= other->int_());
+  }
+};
+
+SymInt create_symbolic_symint(int64_t value) {
+  return SymInt(SymNode(
+      c10::make_intrusive<ConstantIntPretendingToBeSymbolicSymNodeImpl>(
+          value)));
+}
+
+auto unwrap(const SymInt& x) {
+  return x.guard_int(__FILE__, __LINE__);
+}
+
+auto unwrap(bool b) {
+  return b;
+}
+
+template