diff --git a/.automation_scripts/parse_xml_results.py b/.automation_scripts/parse_xml_results.py new file mode 100644 index 000000000000..7db2e1ce9233 --- /dev/null +++ b/.automation_scripts/parse_xml_results.py @@ -0,0 +1,178 @@ +""" The Python PyTorch testing script. +## +# Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. +""" + +import xml.etree.ElementTree as ET +from pathlib import Path +from typing import Any, Dict, Tuple + +# Backends list +BACKENDS_LIST = [ + "dist-gloo", + "dist-nccl" +] + +TARGET_WORKFLOW = "--rerun-disabled-tests" + +def get_job_id(report: Path) -> int: + # [Job id in artifacts] + # Retrieve the job id from the report path. In our GHA workflows, we append + # the job id to the end of the report name, so `report` looks like: + # unzipped-test-reports-foo_5596745227/test/test-reports/foo/TEST-foo.xml + # and we want to get `5596745227` out of it. + try: + return int(report.parts[0].rpartition("_")[2]) + except ValueError: + return -1 + +def is_rerun_disabled_tests(root: ET.ElementTree) -> bool: + """ + Check if the test report is coming from rerun_disabled_tests workflow + """ + skipped = root.find(".//*skipped") + # Need to check against None here, if not skipped doesn't work as expected + if skipped is None: + return False + + message = skipped.attrib.get("message", "") + return TARGET_WORKFLOW in message or "num_red" in message + +def parse_xml_report( + tag: str, + report: Path, + workflow_id: int, + workflow_run_attempt: int, + work_flow_name: str +) -> Dict[Tuple[str], Dict[str, Any]]: + """Convert a test report xml file into a JSON-serializable list of test cases.""" + print(f"Parsing {tag}s for test report: {report}") + + job_id = get_job_id(report) + print(f"Found job id: {job_id}") + + test_cases: Dict[Tuple[str], Dict[str, Any]] = {} + + root = ET.parse(report) + # TODO: unlike unittest, pytest-flakefinder used by rerun disabled tests for test_ops + # includes skipped messages multiple times (50 times by default). This slows down + # this script too much (O(n)) because it tries to gather all the stats. This should + # be fixed later in the way we use pytest-flakefinder. 
A zipped test report from rerun + # disabled test is only few MB, but will balloon up to a much bigger XML file after + # extracting from a dozen to few hundred MB + if is_rerun_disabled_tests(root): + return test_cases + + for test_case in root.iter(tag): + case = process_xml_element(test_case) + if tag == 'testcase': + case["workflow_id"] = workflow_id + case["workflow_run_attempt"] = workflow_run_attempt + case["job_id"] = job_id + case["work_flow_name"] = work_flow_name + + # [invoking file] + # The name of the file that the test is located in is not necessarily + # the same as the name of the file that invoked the test. + # For example, `test_jit.py` calls into multiple other test files (e.g. + # jit/test_dce.py). For sharding/test selection purposes, we want to + # record the file that invoked the test. + # + # To do this, we leverage an implementation detail of how we write out + # tests (https://bit.ly/3ajEV1M), which is that reports are created + # under a folder with the same name as the invoking file. + case_name = report.parent.name + for ind in range(len(BACKENDS_LIST)): + if BACKENDS_LIST[ind] in report.parts: + case_name = case_name + "_" + BACKENDS_LIST[ind] + break + case["invoking_file"] = case_name + test_cases[ ( case["invoking_file"], case["classname"], case["name"], case["work_flow_name"] ) ] = case + elif tag == 'testsuite': + case["work_flow_name"] = work_flow_name + case["invoking_xml"] = report.name + case["running_time_xml"] = case["time"] + case_name = report.parent.name + for ind in range(len(BACKENDS_LIST)): + if BACKENDS_LIST[ind] in report.parts: + case_name = case_name + "_" + BACKENDS_LIST[ind] + break + case["invoking_file"] = case_name + + test_cases[ ( case["invoking_file"], case["invoking_xml"], case["work_flow_name"] ) ] = case + + return test_cases + +def process_xml_element(element: ET.Element) -> Dict[str, Any]: + """Convert a test suite element into a JSON-serializable dict.""" + ret: Dict[str, Any] = {} + + # Convert attributes directly into dict elements. + # e.g. + # + # becomes: + # {"name": "test_foo", "classname": "test_bar"} + ret.update(element.attrib) + + # The XML format encodes all values as strings. Convert to ints/floats if + # possible to make aggregation possible in Rockset. + for k, v in ret.items(): + try: + ret[k] = int(v) + except ValueError: + pass + try: + ret[k] = float(v) + except ValueError: + pass + + # Convert inner and outer text into special dict elements. + # e.g. + # my_inner_text my_tail + # becomes: + # {"text": "my_inner_text", "tail": " my_tail"} + if element.text and element.text.strip(): + ret["text"] = element.text + if element.tail and element.tail.strip(): + ret["tail"] = element.tail + + # Convert child elements recursively, placing them at a key: + # e.g. + # + # hello + # world + # another + # + # becomes + # { + # "foo": [{"text": "hello"}, {"text": "world"}], + # "bar": {"text": "another"} + # } + for child in element: + if child.tag not in ret: + ret[child.tag] = process_xml_element(child) + else: + # If there are multiple tags with the same name, they should be + # coalesced into a list. 
+ if not isinstance(ret[child.tag], list): + ret[child.tag] = [ret[child.tag]] + ret[child.tag].append(process_xml_element(child)) + return ret \ No newline at end of file diff --git a/.automation_scripts/run_pytorch_unit_tests.py b/.automation_scripts/run_pytorch_unit_tests.py new file mode 100644 index 000000000000..514afd19624c --- /dev/null +++ b/.automation_scripts/run_pytorch_unit_tests.py @@ -0,0 +1,518 @@ +#!/usr/bin/env python3 + +""" The Python PyTorch testing script. +## +# Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. +""" + +import argparse +import os +import shutil +import subprocess +from subprocess import STDOUT, CalledProcessError + +from collections import namedtuple +from datetime import datetime +from pathlib import Path +from parse_xml_results import ( + parse_xml_report +) +from pprint import pprint +from typing import Any, Dict, List + +# unit test status list +UT_STATUS_LIST = [ + "PASSED", + "MISSED", + "SKIPPED", + "FAILED", + "XFAILED", + "ERROR" +] + +DEFAULT_CORE_TESTS = [ + "test_nn", + "test_torch", + "test_cuda", + "test_ops", + "test_unary_ufuncs", + "test_autograd", + "inductor/test_torchinductor" +] + +DISTRIBUTED_CORE_TESTS = [ + "distributed/test_c10d_common", + "distributed/test_c10d_nccl", + "distributed/test_distributed_spawn" +] + +CONSOLIDATED_LOG_FILE_NAME="pytorch_unit_tests.log" + +def parse_xml_reports_as_dict(workflow_run_id, workflow_run_attempt, tag, workflow_name, path="."): + test_cases = {} + items_list = os.listdir(path) + for dir in items_list: + new_dir = path + '/' + dir + '/' + if os.path.isdir(new_dir): + for xml_report in Path(new_dir).glob("**/*.xml"): + test_cases.update( + parse_xml_report( + tag, + xml_report, + workflow_run_id, + workflow_run_attempt, + workflow_name + ) + ) + return test_cases + +def get_test_status(test_case): + # In order of priority: S=skipped, F=failure, E=error, P=pass + if "skipped" in test_case and test_case["skipped"]: + type_message = test_case["skipped"] + if type_message.__contains__('type') and type_message['type'] == "pytest.xfail": + return "XFAILED" + else: + return "SKIPPED" + elif "failure" in test_case and test_case["failure"]: + return "FAILED" + elif "error" in test_case and test_case["error"]: + return "ERROR" + else: + return "PASSED" + +def get_test_message(test_case, status=None): + if status == "SKIPPED": + return test_case["skipped"] if "skipped" in test_case else "" + elif status == "FAILED": + return 
test_case["failure"] if "failure" in test_case else "" + elif status == "ERROR": + return test_case["error"] if "error" in test_case else "" + else: + if "skipped" in test_case: + return test_case["skipped"] + elif "failure" in test_case: + return test_case["failure"] + elif "error" in test_case: + return test_case["error"] + else: + return "" + +def get_test_file_running_time(test_suite): + if test_suite.__contains__('time'): + return test_suite["time"] + return 0 + +def get_test_running_time(test_case): + if test_case.__contains__('time'): + return test_case["time"] + return "" + +def summarize_xml_files(path, workflow_name): + # statistics + TOTAL_TEST_NUM = 0 + TOTAL_PASSED_NUM = 0 + TOTAL_SKIPPED_NUM = 0 + TOTAL_XFAIL_NUM = 0 + TOTAL_FAILED_NUM = 0 + TOTAL_ERROR_NUM = 0 + TOTAL_EXECUTION_TIME = 0 + + #parse the xml files + test_cases = parse_xml_reports_as_dict(-1, -1, 'testcase', workflow_name, path) + test_suites = parse_xml_reports_as_dict(-1, -1, 'testsuite', workflow_name, path) + test_file_and_status = namedtuple("test_file_and_status", ["file_name", "status"]) + # results dict + res = {} + res_item_list = [ "PASSED", "SKIPPED", "XFAILED", "FAILED", "ERROR" ] + test_file_items = set() + for (k,v) in list(test_suites.items()): + file_name = k[0] + if not file_name in test_file_items: + test_file_items.add(file_name) + # initialization + for item in res_item_list: + temp_item = test_file_and_status(file_name, item) + res[temp_item] = {} + temp_item_statistics = test_file_and_status(file_name, "STATISTICS") + res[temp_item_statistics] = {'TOTAL': 0, 'PASSED': 0, 'SKIPPED': 0, 'XFAILED': 0, 'FAILED': 0, 'ERROR': 0, 'EXECUTION_TIME': 0} + test_running_time = get_test_file_running_time(v) + res[temp_item_statistics]["EXECUTION_TIME"] += test_running_time + TOTAL_EXECUTION_TIME += test_running_time + else: + test_tuple_key_statistics = test_file_and_status(file_name, "STATISTICS") + test_running_time = get_test_file_running_time(v) + res[test_tuple_key_statistics]["EXECUTION_TIME"] += test_running_time + TOTAL_EXECUTION_TIME += test_running_time + + for (k,v) in list(test_cases.items()): + file_name = k[0] + class_name = k[1] + test_name = k[2] + combined_name = file_name + "::" + class_name + "::" + test_name + test_status = get_test_status(v) + test_running_time = get_test_running_time(v) + test_message = get_test_message(v, test_status) + test_info_value = "" + test_tuple_key_status = test_file_and_status(file_name, test_status) + test_tuple_key_statistics = test_file_and_status(file_name, "STATISTICS") + TOTAL_TEST_NUM += 1 + res[test_tuple_key_statistics]["TOTAL"] += 1 + if test_status == "PASSED": + test_info_value = str(test_running_time) + res[test_tuple_key_status][combined_name] = test_info_value + res[test_tuple_key_statistics]["PASSED"] += 1 + TOTAL_PASSED_NUM += 1 + elif test_status == "SKIPPED": + test_info_value = str(test_running_time) + res[test_tuple_key_status][combined_name] = test_info_value + res[test_tuple_key_statistics]["SKIPPED"] += 1 + TOTAL_SKIPPED_NUM += 1 + elif test_status == "XFAILED": + test_info_value = str(test_running_time) + res[test_tuple_key_status][combined_name] = test_info_value + res[test_tuple_key_statistics]["XFAILED"] += 1 + TOTAL_XFAIL_NUM += 1 + elif test_status == "FAILED": + test_info_value = test_message + res[test_tuple_key_status][combined_name] = test_info_value + res[test_tuple_key_statistics]["FAILED"] += 1 + TOTAL_FAILED_NUM += 1 + elif test_status == "ERROR": + test_info_value = test_message + 
res[test_tuple_key_status][combined_name] = test_info_value + res[test_tuple_key_statistics]["ERROR"] += 1 + TOTAL_ERROR_NUM += 1 + + # generate statistics_dict + statistics_dict = {} + statistics_dict["TOTAL"] = TOTAL_TEST_NUM + statistics_dict["PASSED"] = TOTAL_PASSED_NUM + statistics_dict["SKIPPED"] = TOTAL_SKIPPED_NUM + statistics_dict["XFAILED"] = TOTAL_XFAIL_NUM + statistics_dict["FAILED"] = TOTAL_FAILED_NUM + statistics_dict["ERROR"] = TOTAL_ERROR_NUM + statistics_dict["EXECUTION_TIME"] = TOTAL_EXECUTION_TIME + aggregate_item = workflow_name + "_aggregate" + total_item = test_file_and_status(aggregate_item, "STATISTICS") + res[total_item] = statistics_dict + + return res + +def run_command_and_capture_output(cmd): + try: + print(f"Running command '{cmd}'") + with open(CONSOLIDATED_LOG_FILE_PATH, "a+") as output_file: + print(f"========================================", file=output_file, flush=True) + print(f"[RUN_PYTORCH_UNIT_TESTS] Running command '{cmd}'", file=output_file, flush=True) # send to consolidated file as well + print(f"========================================", file=output_file, flush=True) + p = subprocess.run(cmd, shell=True, stdout=output_file, stderr=STDOUT, text=True) + except CalledProcessError as e: + print(f"ERROR: Cmd {cmd} failed with return code: {e.returncode}!") + +def run_entire_tests(workflow_name, test_shell_path, overall_logs_path_current_run, test_reports_src): + if os.path.exists(test_reports_src): + shutil.rmtree(test_reports_src) + + os.mkdir(test_reports_src) + copied_logs_path = "" + if workflow_name == "default": + os.environ['TEST_CONFIG'] = 'default' + copied_logs_path = overall_logs_path_current_run + "default_xml_results_entire_tests/" + elif workflow_name == "distributed": + os.environ['TEST_CONFIG'] = 'distributed' + copied_logs_path = overall_logs_path_current_run + "distributed_xml_results_entire_tests/" + elif workflow_name == "inductor": + os.environ['TEST_CONFIG'] = 'inductor' + copied_logs_path = overall_logs_path_current_run + "inductor_xml_results_entire_tests/" + # use test.sh for tests execution + run_command_and_capture_output(test_shell_path) + copied_logs_path_destination = shutil.copytree(test_reports_src, copied_logs_path) + entire_results_dict = summarize_xml_files(copied_logs_path_destination, workflow_name) + return entire_results_dict + +def run_priority_tests(workflow_name, test_run_test_path, overall_logs_path_current_run, test_reports_src): + if os.path.exists(test_reports_src): + shutil.rmtree(test_reports_src) + + os.mkdir(test_reports_src) + copied_logs_path = "" + if workflow_name == "default": + os.environ['TEST_CONFIG'] = 'default' + os.environ['HIP_VISIBLE_DEVICES'] = '0' + copied_logs_path = overall_logs_path_current_run + "default_xml_results_priority_tests/" + # use run_test.py for tests execution + default_priority_test_suites = " ".join(DEFAULT_CORE_TESTS) + command = "python3 " + test_run_test_path + " --include " + default_priority_test_suites + " --exclude-jit-executor --exclude-distributed-tests --verbose" + run_command_and_capture_output(command) + del os.environ['HIP_VISIBLE_DEVICES'] + elif workflow_name == "distributed": + os.environ['TEST_CONFIG'] = 'distributed' + os.environ['HIP_VISIBLE_DEVICES'] = '0,1' + copied_logs_path = overall_logs_path_current_run + "distributed_xml_results_priority_tests/" + # use run_test.py for tests execution + distributed_priority_test_suites = " ".join(DISTRIBUTED_CORE_TESTS) + command = "python3 " + test_run_test_path + " --include " + 
distributed_priority_test_suites + " --distributed-tests --verbose" + run_command_and_capture_output(command) + del os.environ['HIP_VISIBLE_DEVICES'] + copied_logs_path_destination = shutil.copytree(test_reports_src, copied_logs_path) + priority_results_dict = summarize_xml_files(copied_logs_path_destination, workflow_name) + + return priority_results_dict + +def run_selected_tests(workflow_name, test_run_test_path, overall_logs_path_current_run, test_reports_src, selected_list): + if os.path.exists(test_reports_src): + shutil.rmtree(test_reports_src) + + os.mkdir(test_reports_src) + copied_logs_path = "" + if workflow_name == "default": + os.environ['TEST_CONFIG'] = 'default' + os.environ['HIP_VISIBLE_DEVICES'] = '0' + copied_logs_path = overall_logs_path_current_run + "default_xml_results_selected_tests/" + # use run_test.py for tests execution + default_selected_test_suites = " ".join(selected_list) + command = "python3 " + test_run_test_path + " --include " + default_selected_test_suites + " --exclude-jit-executor --exclude-distributed-tests --verbose" + run_command_and_capture_output(command) + del os.environ['HIP_VISIBLE_DEVICES'] + elif workflow_name == "distributed": + os.environ['TEST_CONFIG'] = 'distributed' + os.environ['HIP_VISIBLE_DEVICES'] = '0,1' + copied_logs_path = overall_logs_path_current_run + "distributed_xml_results_selected_tests/" + # use run_test.py for tests execution + distributed_selected_test_suites = " ".join(selected_list) + command = "python3 " + test_run_test_path + " --include " + distributed_selected_test_suites + " --distributed-tests --verbose" + run_command_and_capture_output(command) + del os.environ['HIP_VISIBLE_DEVICES'] + elif workflow_name == "inductor": + os.environ['TEST_CONFIG'] = 'inductor' + copied_logs_path = overall_logs_path_current_run + "inductor_xml_results_selected_tests/" + inductor_selected_test_suites = "" + non_inductor_selected_test_suites = "" + for item in selected_list: + if "inductor/" in item: + inductor_selected_test_suites += item + inductor_selected_test_suites += " " + else: + non_inductor_selected_test_suites += item + non_inductor_selected_test_suites += " " + if inductor_selected_test_suites != "": + inductor_selected_test_suites = inductor_selected_test_suites[:-1] + command = "python3 " + test_run_test_path + " --include " + inductor_selected_test_suites + " --verbose" + run_command_and_capture_output(command) + if non_inductor_selected_test_suites != "": + non_inductor_selected_test_suites = non_inductor_selected_test_suites[:-1] + command = "python3 " + test_run_test_path + " --inductor --include " + non_inductor_selected_test_suites + " --verbose" + run_command_and_capture_output(command) + copied_logs_path_destination = shutil.copytree(test_reports_src, copied_logs_path) + selected_results_dict = summarize_xml_files(copied_logs_path_destination, workflow_name) + + return selected_results_dict + +def run_test_and_summarize_results( + pytorch_root_dir: str, + priority_tests: bool, + test_config: List[str], + default_list: List[str], + distributed_list: List[str], + inductor_list: List[str], + skip_rerun: bool) -> Dict[str, Any]: + + # copy current environment variables + _environ = dict(os.environ) + + # modify path + test_shell_path = pytorch_root_dir + "/.ci/pytorch/test.sh" + test_run_test_path = pytorch_root_dir + "/test/run_test.py" + repo_test_log_folder_path = pytorch_root_dir + "/.automation_logs/" + test_reports_src = pytorch_root_dir + "/test/test-reports/" + run_test_python_file = pytorch_root_dir + 
"/test/run_test.py" + + # change directory to pytorch root + os.chdir(pytorch_root_dir) + + # all test results dict + res_all_tests_dict = {} + + # patterns + search_text = "--reruns=2" + replace_text = "--reruns=0" + + # create logs folder + if not os.path.exists(repo_test_log_folder_path): + os.mkdir(repo_test_log_folder_path) + + # Set common environment variables for all scenarios + os.environ['CI'] = '1' + os.environ['PYTORCH_TEST_WITH_ROCM'] = '1' + os.environ['HSA_FORCE_FINE_GRAIN_PCIE'] = '1' + os.environ['PYTORCH_TESTING_DEVICE_ONLY_FOR'] = 'cuda' + os.environ['CONTINUE_THROUGH_ERROR'] = 'True' + if skip_rerun: + # modify run_test.py in-place + with open(run_test_python_file, 'r') as file: + data = file.read() + data = data.replace(search_text, replace_text) + with open(run_test_python_file, 'w') as file: + file.write(data) + + # Time stamp + current_datetime = datetime.now().strftime("%Y%m%d_%H-%M-%S") + print("Current date & time : ", current_datetime) + # performed as Job ID + str_current_datetime = str(current_datetime) + overall_logs_path_current_run = repo_test_log_folder_path + str_current_datetime + "/" + os.mkdir(overall_logs_path_current_run) + + global CONSOLIDATED_LOG_FILE_PATH + CONSOLIDATED_LOG_FILE_PATH = overall_logs_path_current_run + CONSOLIDATED_LOG_FILE_NAME + + # Check multi gpu availability if distributed tests are enabled + if ("distributed" in test_config) or len(distributed_list) != 0: + check_num_gpus_for_distributed() + + # Install test requirements + command = "pip3 install -r requirements.txt && pip3 install -r .ci/docker/requirements-ci.txt" + run_command_and_capture_output(command) + + # Run entire tests for each workflow + if not priority_tests and not default_list and not distributed_list and not inductor_list: + # run entire tests for default, distributed and inductor workflows → use test.sh + if not test_config: + check_num_gpus_for_distributed() + # default test process + res_default_all = run_entire_tests("default", test_shell_path, overall_logs_path_current_run, test_reports_src) + res_all_tests_dict["default"] = res_default_all + # distributed test process + res_distributed_all = run_entire_tests("distributed", test_shell_path, overall_logs_path_current_run, test_reports_src) + res_all_tests_dict["distributed"] = res_distributed_all + # inductor test process + res_inductor_all = run_entire_tests("inductor", test_shell_path, overall_logs_path_current_run, test_reports_src) + res_all_tests_dict["inductor"] = res_inductor_all + else: + workflow_list = [] + for item in test_config: + workflow_list.append(item) + if "default" in workflow_list: + res_default_all = run_entire_tests("default", test_shell_path, overall_logs_path_current_run, test_reports_src) + res_all_tests_dict["default"] = res_default_all + if "distributed" in workflow_list: + res_distributed_all = run_entire_tests("distributed", test_shell_path, overall_logs_path_current_run, test_reports_src) + res_all_tests_dict["distributed"] = res_distributed_all + if "inductor" in workflow_list: + res_inductor_all = run_entire_tests("inductor", test_shell_path, overall_logs_path_current_run, test_reports_src) + res_all_tests_dict["inductor"] = res_inductor_all + # Run priority test for each workflow + elif priority_tests and not default_list and not distributed_list and not inductor_list: + if not test_config: + check_num_gpus_for_distributed() + # default test process + res_default_priority = run_priority_tests("default", test_run_test_path, overall_logs_path_current_run, test_reports_src) + 
res_all_tests_dict["default"] = res_default_priority + # distributed test process + res_distributed_priority = run_priority_tests("distributed", test_run_test_path, overall_logs_path_current_run, test_reports_src) + res_all_tests_dict["distributed"] = res_distributed_priority + # will not run inductor priority tests + print("Inductor priority tests cannot run since no core tests defined with inductor workflow.") + else: + workflow_list = [] + for item in test_config: + workflow_list.append(item) + if "default" in workflow_list: + res_default_priority = run_priority_tests("default", test_run_test_path, overall_logs_path_current_run, test_reports_src) + res_all_tests_dict["default"] = res_default_priority + if "distributed" in workflow_list: + res_distributed_priority = run_priority_tests("distributed", test_run_test_path, overall_logs_path_current_run, test_reports_src) + res_all_tests_dict["distributed"] = res_distributed_priority + if "inductor" in workflow_list: + print("Inductor priority tests cannot run since no core tests defined with inductor workflow.") + # Run specified tests for each workflow + elif (default_list or distributed_list or inductor_list) and not test_config and not priority_tests: + if default_list: + default_workflow_list = [] + for item in default_list: + default_workflow_list.append(item) + res_default_selected = run_selected_tests("default", test_run_test_path, overall_logs_path_current_run, test_reports_src, default_workflow_list) + res_all_tests_dict["default"] = res_default_selected + if distributed_list: + distributed_workflow_list = [] + for item in distributed_list: + distributed_workflow_list.append(item) + res_distributed_selected = run_selected_tests("distributed", test_run_test_path, overall_logs_path_current_run, test_reports_src, distributed_workflow_list) + res_all_tests_dict["distributed"] = res_distributed_selected + if inductor_list: + inductor_workflow_list = [] + for item in inductor_list: + inductor_workflow_list.append(item) + res_inductor_selected = run_selected_tests("inductor", test_run_test_path, overall_logs_path_current_run, test_reports_src, inductor_workflow_list) + res_all_tests_dict["inductor"] = res_inductor_selected + else: + raise Exception("Invalid test configurations!") + + # restore environment variables + os.environ.clear() + os.environ.update(_environ) + + # restore files + if skip_rerun: + # modify run_test.py in-place + with open(run_test_python_file, 'r') as file: + data = file.read() + data = data.replace(replace_text, search_text) + with open(run_test_python_file, 'w') as file: + file.write(data) + + return res_all_tests_dict + +def parse_args(): + parser = argparse.ArgumentParser(description='Run PyTorch unit tests and generate xml results summary', formatter_class=argparse.RawTextHelpFormatter) + parser.add_argument('--test_config', nargs='+', default=[], type=str, help="space-separated list of test workflows to be executed eg. 'default distributed'") + parser.add_argument('--priority_tests', action='store_true', help="run priority tests only") + parser.add_argument('--default_list', nargs='+', default=[], help="space-separated list of 'default' config test suites/files to be executed eg. 'test_weak test_dlpack'") + parser.add_argument('--distributed_list', nargs='+', default=[], help="space-separated list of 'distributed' config test suites/files to be executed eg. 
'distributed/test_c10d_common distributed/test_c10d_nccl'") + parser.add_argument('--inductor_list', nargs='+', default=[], help="space-separated list of 'inductor' config test suites/files to be executed eg. 'inductor/test_torchinductor test_ops'") + parser.add_argument('--pytorch_root', default='.', type=str, help="PyTorch root directory") + parser.add_argument('--skip_rerun', action='store_true', help="skip rerun process") + parser.add_argument('--example_output', type=str, help="{'workflow_name': {\n" + " test_file_and_status(file_name='workflow_aggregate', status='STATISTICS'): {}, \n" + " test_file_and_status(file_name='test_file_name_1', status='ERROR'): {}, \n" + " test_file_and_status(file_name='test_file_name_1', status='FAILED'): {}, \n" + " test_file_and_status(file_name='test_file_name_1', status='PASSED'): {}, \n" + " test_file_and_status(file_name='test_file_name_1', status='SKIPPED'): {}, \n" + " test_file_and_status(file_name='test_file_name_1', status='STATISTICS'): {} \n" + "}}\n") + parser.add_argument('--example_usages', type=str, help="RUN ALL TESTS: python3 run_pytorch_unit_tests.py \n" + "RUN PRIORITY TESTS: python3 run_pytorch_unit_tests.py --test_config distributed --priority_test \n" + "RUN SELECTED TESTS: python3 run_pytorch_unit_tests.py --default_list test_weak test_dlpack --inductor_list inductor/test_torchinductor") + return parser.parse_args() + +def check_num_gpus_for_distributed(): + p = subprocess.run("rocminfo | grep -cE 'Name:\s+gfx'", shell=True, capture_output=True, text=True) + num_gpus_visible = int(p.stdout) + assert num_gpus_visible > 1, "Number of visible GPUs should be >1 to run distributed unit tests" + +def main(): + args = parse_args() + all_tests_results = run_test_and_summarize_results(args.pytorch_root, args.priority_tests, args.test_config, args.default_list, args.distributed_list, args.inductor_list, args.skip_rerun) + pprint(dict(all_tests_results)) + +if __name__ == "__main__": + main() diff --git a/.ci/aarch64_linux/aarch64_ci_build.sh b/.ci/aarch64_linux/aarch64_ci_build.sh index 424ddd0013cd..41cabc3bf511 100644 --- a/.ci/aarch64_linux/aarch64_ci_build.sh +++ b/.ci/aarch64_linux/aarch64_ci_build.sh @@ -3,8 +3,20 @@ set -eux -o pipefail GPU_ARCH_VERSION=${GPU_ARCH_VERSION:-} -if [[ "$GPU_ARCH_VERSION" == *"12.9"* ]]; then +# Set CUDA architecture lists to match x86 build_cuda.sh +if [[ "$GPU_ARCH_VERSION" == *"12.6"* ]]; then + export TORCH_CUDA_ARCH_LIST="8.0;9.0" +elif [[ "$GPU_ARCH_VERSION" == *"12.8"* ]]; then export TORCH_CUDA_ARCH_LIST="8.0;9.0;10.0;12.0" +elif [[ "$GPU_ARCH_VERSION" == *"13.0"* ]]; then + export TORCH_CUDA_ARCH_LIST="8.0;9.0;10.0;11.0;12.0+PTX" +fi + +# Compress the fatbin with -compress-mode=size for CUDA 13 +if [[ "$DESIRED_CUDA" == *"13"* ]]; then + export TORCH_NVCC_FLAGS="-compress-mode=size" + # Bundle ptxas into the cu13 wheel, see https://github.com/pytorch/pytorch/issues/163801 + export BUILD_BUNDLE_PTXAS=1 fi SCRIPTPATH="$( cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )" @@ -18,7 +30,7 @@ cd / # on the mounted pytorch repo git config --global --add safe.directory /pytorch pip install -r /pytorch/requirements.txt -pip install auditwheel==6.2.0 +pip install auditwheel==6.2.0 wheel if [ "$DESIRED_CUDA" = "cpu" ]; then echo "BASE_CUDA_VERSION is not set. Building cpu wheel." 
#USE_PRIORITIZED_TEXT_FOR_LD for enable linker script optimization https://github.com/pytorch/pytorch/pull/121975/files @@ -26,6 +38,16 @@ if [ "$DESIRED_CUDA" = "cpu" ]; then else echo "BASE_CUDA_VERSION is set to: $DESIRED_CUDA" export USE_SYSTEM_NCCL=1 + + # Check if we should use NVIDIA libs from PyPI (similar to x86 build_cuda.sh logic) + if [[ -z "$PYTORCH_EXTRA_INSTALL_REQUIREMENTS" ]]; then + echo "Bundling CUDA libraries with wheel for aarch64." + else + echo "Using nvidia libs from pypi for aarch64." + echo "Updated PYTORCH_EXTRA_INSTALL_REQUIREMENTS for aarch64: $PYTORCH_EXTRA_INSTALL_REQUIREMENTS" + export USE_NVIDIA_PYPI_LIBS=1 + fi + #USE_PRIORITIZED_TEXT_FOR_LD for enable linker script optimization https://github.com/pytorch/pytorch/pull/121975/files USE_PRIORITIZED_TEXT_FOR_LD=1 python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn --enable-cuda fi diff --git a/.ci/aarch64_linux/aarch64_wheel_ci_build.py b/.ci/aarch64_linux/aarch64_wheel_ci_build.py index a2b5f6912c9a..1b6429fa8c06 100755 --- a/.ci/aarch64_linux/aarch64_wheel_ci_build.py +++ b/.ci/aarch64_linux/aarch64_wheel_ci_build.py @@ -69,62 +69,186 @@ def replace_tag(filename) -> None: f.writelines(lines) +def patch_library_rpath( + folder: str, + lib_name: str, + use_nvidia_pypi_libs: bool = False, + desired_cuda: str = "", +) -> None: + """Apply patchelf to set RPATH for a library in torch/lib""" + lib_path = f"{folder}/tmp/torch/lib/{lib_name}" + + if use_nvidia_pypi_libs: + # For PyPI NVIDIA libraries, construct CUDA RPATH + cuda_rpaths = [ + "$ORIGIN/../../nvidia/cudnn/lib", + "$ORIGIN/../../nvidia/nvshmem/lib", + "$ORIGIN/../../nvidia/nccl/lib", + "$ORIGIN/../../nvidia/cusparselt/lib", + ] + + if "130" in desired_cuda: + cuda_rpaths.append("$ORIGIN/../../nvidia/cu13/lib") + else: + cuda_rpaths.extend( + [ + "$ORIGIN/../../nvidia/cublas/lib", + "$ORIGIN/../../nvidia/cuda_cupti/lib", + "$ORIGIN/../../nvidia/cuda_nvrtc/lib", + "$ORIGIN/../../nvidia/cuda_runtime/lib", + "$ORIGIN/../../nvidia/cufft/lib", + "$ORIGIN/../../nvidia/curand/lib", + "$ORIGIN/../../nvidia/cusolver/lib", + "$ORIGIN/../../nvidia/cusparse/lib", + "$ORIGIN/../../nvidia/nvtx/lib", + "$ORIGIN/../../nvidia/cufile/lib", + ] + ) + + # Add $ORIGIN for local torch libs + rpath = ":".join(cuda_rpaths) + ":$ORIGIN" + else: + # For bundled libraries, just use $ORIGIN + rpath = "$ORIGIN" + + if os.path.exists(lib_path): + os.system( + f"cd {folder}/tmp/torch/lib/; " + f"patchelf --set-rpath '{rpath}' --force-rpath {lib_name}" + ) + + +def copy_and_patch_library( + src_path: str, + folder: str, + use_nvidia_pypi_libs: bool = False, + desired_cuda: str = "", +) -> None: + """Copy a library to torch/lib and patch its RPATH""" + if os.path.exists(src_path): + lib_name = os.path.basename(src_path) + shutil.copy2(src_path, f"{folder}/tmp/torch/lib/{lib_name}") + patch_library_rpath(folder, lib_name, use_nvidia_pypi_libs, desired_cuda) + + def package_cuda_wheel(wheel_path, desired_cuda) -> None: """ Package the cuda wheel libraries """ folder = os.path.dirname(wheel_path) - wheelname = os.path.basename(wheel_path) os.mkdir(f"{folder}/tmp") os.system(f"unzip {wheel_path} -d {folder}/tmp") - libs_to_copy = [ - "/usr/local/cuda/extras/CUPTI/lib64/libcupti.so.12", - "/usr/local/cuda/extras/CUPTI/lib64/libnvperf_host.so", - "/usr/local/cuda/lib64/libcudnn.so.9", - "/usr/local/cuda/lib64/libcublas.so.12", - "/usr/local/cuda/lib64/libcublasLt.so.12", - "/usr/local/cuda/lib64/libcudart.so.12", - "/usr/local/cuda/lib64/libcufft.so.11", - 
"/usr/local/cuda/lib64/libcusparse.so.12", - "/usr/local/cuda/lib64/libcusparseLt.so.0", - "/usr/local/cuda/lib64/libcusolver.so.11", - "/usr/local/cuda/lib64/libcurand.so.10", - "/usr/local/cuda/lib64/libnccl.so.2", - "/usr/local/cuda/lib64/libnvJitLink.so.12", - "/usr/local/cuda/lib64/libnvrtc.so.12", - "/usr/local/cuda/lib64/libnvshmem_host.so.3", - "/usr/local/cuda/lib64/libcudnn_adv.so.9", - "/usr/local/cuda/lib64/libcudnn_cnn.so.9", - "/usr/local/cuda/lib64/libcudnn_graph.so.9", - "/usr/local/cuda/lib64/libcudnn_ops.so.9", - "/usr/local/cuda/lib64/libcudnn_engines_runtime_compiled.so.9", - "/usr/local/cuda/lib64/libcudnn_engines_precompiled.so.9", - "/usr/local/cuda/lib64/libcudnn_heuristic.so.9", - "/lib64/libgomp.so.1", - "/usr/lib64/libgfortran.so.5", - "/acl/build/libarm_compute.so", - "/acl/build/libarm_compute_graph.so", - "/usr/local/lib/libnvpl_lapack_lp64_gomp.so.0", - "/usr/local/lib/libnvpl_blas_lp64_gomp.so.0", - "/usr/local/lib/libnvpl_lapack_core.so.0", - "/usr/local/lib/libnvpl_blas_core.so.0", - ] + # Delete original wheel since it will be repackaged + os.system(f"rm {wheel_path}") + + # Check if we should use PyPI NVIDIA libraries or bundle system libraries + use_nvidia_pypi_libs = os.getenv("USE_NVIDIA_PYPI_LIBS", "0") == "1" + + if use_nvidia_pypi_libs: + print("Using nvidia libs from pypi - skipping CUDA library bundling") + # For PyPI approach, we don't bundle CUDA libraries - they come from PyPI packages + # We only need to bundle non-NVIDIA libraries + minimal_libs_to_copy = [ + "/lib64/libgomp.so.1", + "/usr/lib64/libgfortran.so.5", + "/acl/build/libarm_compute.so", + "/acl/build/libarm_compute_graph.so", + "/usr/local/lib/libnvpl_lapack_lp64_gomp.so.0", + "/usr/local/lib/libnvpl_blas_lp64_gomp.so.0", + "/usr/local/lib/libnvpl_lapack_core.so.0", + "/usr/local/lib/libnvpl_blas_core.so.0", + ] - if "129" in desired_cuda: - libs_to_copy += [ - "/usr/local/cuda/lib64/libnvrtc-builtins.so.12.9", + # Copy minimal libraries to unzipped_folder/torch/lib + for lib_path in minimal_libs_to_copy: + copy_and_patch_library(lib_path, folder, use_nvidia_pypi_libs, desired_cuda) + + # Patch torch libraries used for searching libraries + torch_libs_to_patch = [ + "libtorch.so", + "libtorch_cpu.so", + "libtorch_cuda.so", + "libtorch_cuda_linalg.so", + "libtorch_global_deps.so", + "libtorch_python.so", + "libtorch_nvshmem.so", + "libc10.so", + "libc10_cuda.so", + "libcaffe2_nvrtc.so", + "libshm.so", + ] + for lib_name in torch_libs_to_patch: + patch_library_rpath(folder, lib_name, use_nvidia_pypi_libs, desired_cuda) + else: + print("Bundling CUDA libraries with wheel") + # Original logic for bundling system CUDA libraries + # Common libraries for all CUDA versions + common_libs = [ + # Non-NVIDIA system libraries + "/lib64/libgomp.so.1", + "/usr/lib64/libgfortran.so.5", + "/acl/build/libarm_compute.so", + "/acl/build/libarm_compute_graph.so", + # Common CUDA libraries (same for all versions) + "/usr/local/lib/libnvpl_lapack_lp64_gomp.so.0", + "/usr/local/lib/libnvpl_blas_lp64_gomp.so.0", + "/usr/local/lib/libnvpl_lapack_core.so.0", + "/usr/local/lib/libnvpl_blas_core.so.0", + "/usr/local/cuda/extras/CUPTI/lib64/libnvperf_host.so", + "/usr/local/cuda/lib64/libcudnn.so.9", + "/usr/local/cuda/lib64/libcusparseLt.so.0", + "/usr/local/cuda/lib64/libcurand.so.10", + "/usr/local/cuda/lib64/libnccl.so.2", + "/usr/local/cuda/lib64/libnvshmem_host.so.3", + "/usr/local/cuda/lib64/libcudnn_adv.so.9", + "/usr/local/cuda/lib64/libcudnn_cnn.so.9", + "/usr/local/cuda/lib64/libcudnn_graph.so.9", 
+ "/usr/local/cuda/lib64/libcudnn_ops.so.9", + "/usr/local/cuda/lib64/libcudnn_engines_runtime_compiled.so.9", + "/usr/local/cuda/lib64/libcudnn_engines_precompiled.so.9", + "/usr/local/cuda/lib64/libcudnn_heuristic.so.9", "/usr/local/cuda/lib64/libcufile.so.0", "/usr/local/cuda/lib64/libcufile_rdma.so.1", + "/usr/local/cuda/lib64/libcusparse.so.12", ] - # Copy libraries to unzipped_folder/a/lib - for lib_path in libs_to_copy: - lib_name = os.path.basename(lib_path) - shutil.copy2(lib_path, f"{folder}/tmp/torch/lib/{lib_name}") - os.system( - f"cd {folder}/tmp/torch/lib/; " - f"patchelf --set-rpath '$ORIGIN' --force-rpath {folder}/tmp/torch/lib/{lib_name}" - ) + # CUDA version-specific libraries + if "13" in desired_cuda: + minor_version = desired_cuda[-1] + version_specific_libs = [ + "/usr/local/cuda/extras/CUPTI/lib64/libcupti.so.13", + "/usr/local/cuda/lib64/libcublas.so.13", + "/usr/local/cuda/lib64/libcublasLt.so.13", + "/usr/local/cuda/lib64/libcudart.so.13", + "/usr/local/cuda/lib64/libcufft.so.12", + "/usr/local/cuda/lib64/libcusolver.so.12", + "/usr/local/cuda/lib64/libnvJitLink.so.13", + "/usr/local/cuda/lib64/libnvrtc.so.13", + f"/usr/local/cuda/lib64/libnvrtc-builtins.so.13.{minor_version}", + ] + elif "12" in desired_cuda: + # Get the last character for libnvrtc-builtins version (e.g., "129" -> "9") + minor_version = desired_cuda[-1] + version_specific_libs = [ + "/usr/local/cuda/extras/CUPTI/lib64/libcupti.so.12", + "/usr/local/cuda/lib64/libcublas.so.12", + "/usr/local/cuda/lib64/libcublasLt.so.12", + "/usr/local/cuda/lib64/libcudart.so.12", + "/usr/local/cuda/lib64/libcufft.so.11", + "/usr/local/cuda/lib64/libcusolver.so.11", + "/usr/local/cuda/lib64/libnvJitLink.so.12", + "/usr/local/cuda/lib64/libnvrtc.so.12", + f"/usr/local/cuda/lib64/libnvrtc-builtins.so.12.{minor_version}", + ] + else: + raise ValueError(f"Unsupported CUDA version: {desired_cuda}.") + + # Combine all libraries + libs_to_copy = common_libs + version_specific_libs + + # Copy libraries to unzipped_folder/torch/lib + for lib_path in libs_to_copy: + copy_and_patch_library(lib_path, folder, use_nvidia_pypi_libs, desired_cuda) # Make sure the wheel is tagged with manylinux_2_28 for f in os.scandir(f"{folder}/tmp/"): @@ -132,14 +256,8 @@ def package_cuda_wheel(wheel_path, desired_cuda) -> None: replace_tag(f"{f.path}/WHEEL") break - os.mkdir(f"{folder}/cuda_wheel") - os.system(f"cd {folder}/tmp/; zip -r {folder}/cuda_wheel/{wheelname} *") - shutil.move( - f"{folder}/cuda_wheel/{wheelname}", - f"{folder}/{wheelname}", - copy_function=shutil.copy2, - ) - os.system(f"rm -rf {folder}/tmp/ {folder}/cuda_wheel/") + os.system(f"wheel pack {folder}/tmp/ -d {folder}") + os.system(f"rm -rf {folder}/tmp/") def complete_wheel(folder: str) -> str: @@ -162,14 +280,7 @@ def complete_wheel(folder: str) -> str: f"/{folder}/dist/{repaired_wheel_name}", ) else: - repaired_wheel_name = wheel_name.replace( - "linux_aarch64", "manylinux_2_28_aarch64" - ) - print(f"Renaming {wheel_name} wheel to {repaired_wheel_name}") - os.rename( - f"/{folder}/dist/{wheel_name}", - f"/{folder}/dist/{repaired_wheel_name}", - ) + repaired_wheel_name = list_dir(f"/{folder}/dist")[0] print(f"Copying {repaired_wheel_name} to artifacts") shutil.copy2( @@ -211,6 +322,16 @@ def parse_arguments(): if enable_cuda: build_vars += "MAX_JOBS=5 " + # Handle PyPI NVIDIA libraries vs bundled libraries + use_nvidia_pypi_libs = os.getenv("USE_NVIDIA_PYPI_LIBS", "0") == "1" + if use_nvidia_pypi_libs: + print("Configuring build for PyPI NVIDIA libraries") + # 
Configure for dynamic linking (matching x86 logic) + build_vars += "ATEN_STATIC_CUDA=0 USE_CUDA_STATIC_LINK=0 USE_CUPTI_SO=1 " + else: + print("Configuring build for bundled NVIDIA libraries") + # Keep existing static linking approach - already configured above + override_package_version = os.getenv("OVERRIDE_PACKAGE_VERSION") desired_cuda = os.getenv("DESIRED_CUDA") if override_package_version is not None: diff --git a/.ci/docker/build.sh b/.ci/docker/build.sh index f22aa919e434..8672fae2bbdd 100755 --- a/.ci/docker/build.sh +++ b/.ci/docker/build.sh @@ -81,8 +81,8 @@ elif [[ "$image" == *riscv* ]]; then DOCKERFILE="ubuntu-cross-riscv/Dockerfile" fi -_UCX_COMMIT=7bb2722ff2187a0cad557ae4a6afa090569f83fb -_UCC_COMMIT=20eae37090a4ce1b32bcce6144ccad0b49943e0b +_UCX_COMMIT=7836b165abdbe468a2f607e7254011c07d788152 +_UCC_COMMIT=430e241bf5d38cbc73fc7a6b89155397232e3f96 if [[ "$image" == *rocm* ]]; then _UCX_COMMIT=cc312eaa4655c0cc5c2bcd796db938f90563bcf6 _UCC_COMMIT=0c0fc21559835044ab107199e334f7157d6a0d3d @@ -114,31 +114,19 @@ case "$tag" in UCC_COMMIT=${_UCC_COMMIT} TRITON=yes ;; - pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks) - CUDA_VERSION=12.8.1 + pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc11) + CUDA_VERSION=13.0.0 ANACONDA_PYTHON_VERSION=3.10 - GCC_VERSION=9 - VISION=yes - KATEX=yes - UCX_COMMIT=${_UCX_COMMIT} - UCC_COMMIT=${_UCC_COMMIT} - TRITON=yes - INDUCTOR_BENCHMARKS=yes - ;; - pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc9-inductor-benchmarks) - CUDA_VERSION=12.8.1 - ANACONDA_PYTHON_VERSION=3.12 - GCC_VERSION=9 + GCC_VERSION=11 VISION=yes KATEX=yes UCX_COMMIT=${_UCX_COMMIT} UCC_COMMIT=${_UCC_COMMIT} TRITON=yes - INDUCTOR_BENCHMARKS=yes ;; - pytorch-linux-jammy-cuda12.8-cudnn9-py3.13-gcc9-inductor-benchmarks) + pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks) CUDA_VERSION=12.8.1 - ANACONDA_PYTHON_VERSION=3.13 + ANACONDA_PYTHON_VERSION=3.10 GCC_VERSION=9 VISION=yes KATEX=yes @@ -173,8 +161,8 @@ case "$tag" in VISION=yes ONNX=yes ;; - pytorch-linux-jammy-py3.9-clang12) - ANACONDA_PYTHON_VERSION=3.9 + pytorch-linux-jammy-py3.10-clang12) + ANACONDA_PYTHON_VERSION=3.10 CLANG_VERSION=12 VISION=yes TRITON=yes @@ -209,24 +197,24 @@ case "$tag" in UCC_COMMIT=${_UCC_COMMIT} PYTORCH_ROCM_ARCH="gfx90a;gfx942;gfx950" ;; - pytorch-linux-jammy-xpu-2025.0-py3) - ANACONDA_PYTHON_VERSION=3.9 + pytorch-linux-jammy-xpu-n-1-py3) + ANACONDA_PYTHON_VERSION=3.10 GCC_VERSION=11 VISION=yes - XPU_VERSION=2025.0 + XPU_VERSION=2025.1 NINJA_VERSION=1.9.0 TRITON=yes ;; - pytorch-linux-jammy-xpu-2025.1-py3) - ANACONDA_PYTHON_VERSION=3.9 + pytorch-linux-jammy-xpu-n-py3) + ANACONDA_PYTHON_VERSION=3.10 GCC_VERSION=11 VISION=yes - XPU_VERSION=2025.1 + XPU_VERSION=2025.2 NINJA_VERSION=1.9.0 TRITON=yes ;; - pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks) - ANACONDA_PYTHON_VERSION=3.9 + pytorch-linux-jammy-py3-gcc11-inductor-benchmarks) + ANACONDA_PYTHON_VERSION=3.10 GCC_VERSION=11 VISION=yes KATEX=yes @@ -234,8 +222,8 @@ case "$tag" in DOCS=yes INDUCTOR_BENCHMARKS=yes ;; - pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-clang12) - ANACONDA_PYTHON_VERSION=3.9 + pytorch-linux-jammy-cuda12.8-cudnn9-py3.10-clang12) + ANACONDA_PYTHON_VERSION=3.10 CUDA_VERSION=12.8.1 CLANG_VERSION=12 VISION=yes @@ -246,8 +234,8 @@ case "$tag" in CLANG_VERSION=18 VISION=yes ;; - pytorch-linux-jammy-py3.9-gcc11) - ANACONDA_PYTHON_VERSION=3.9 + pytorch-linux-jammy-py3.10-gcc11) + ANACONDA_PYTHON_VERSION=3.10 GCC_VERSION=11 VISION=yes KATEX=yes @@ -274,13 +262,10 @@ case "$tag" in TRITON_CPU=yes ;; 
pytorch-linux-jammy-linter) - # TODO: Use 3.9 here because of this issue https://github.com/python/mypy/issues/13627. - # We will need to update mypy version eventually, but that's for another day. The task - # would be to upgrade mypy to 1.0.0 with Python 3.11 - PYTHON_VERSION=3.9 + PYTHON_VERSION=3.10 ;; - pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-linter) - PYTHON_VERSION=3.9 + pytorch-linux-jammy-cuda12.8-cudnn9-py3.10-linter) + PYTHON_VERSION=3.10 CUDA_VERSION=12.8.1 ;; pytorch-linux-jammy-aarch64-py3.10-gcc11) diff --git a/.ci/docker/ci_commit_pins/torchbench.txt b/.ci/docker/ci_commit_pins/torchbench.txt index efbc3ceeb2af..c9be7b440bae 100644 --- a/.ci/docker/ci_commit_pins/torchbench.txt +++ b/.ci/docker/ci_commit_pins/torchbench.txt @@ -1 +1 @@ -e03a63be43e33596f7f0a43b0f530353785e4a59 +74a23feff57432129df84d8099e622773cf77925 diff --git a/.ci/docker/ci_commit_pins/triton-xpu.txt b/.ci/docker/ci_commit_pins/triton-xpu.txt index 3be14be85ad6..b03606f6defc 100644 --- a/.ci/docker/ci_commit_pins/triton-xpu.txt +++ b/.ci/docker/ci_commit_pins/triton-xpu.txt @@ -1 +1 @@ -a6572fb0be5b9b0a19b0641a0ce05810fa04e44c +1b0418a9a454b2b93ab8d71f40e59d2297157fae diff --git a/.ci/docker/ci_commit_pins/triton.txt b/.ci/docker/ci_commit_pins/triton.txt index 60c896b80c8f..99ec5b4aa341 100644 --- a/.ci/docker/ci_commit_pins/triton.txt +++ b/.ci/docker/ci_commit_pins/triton.txt @@ -1 +1 @@ -f7888497a1eb9e98d4c07537f0d0bcfe180d1363 +d08c31a24d622b4bf767a6645135b7b3d0d886f4 diff --git a/.ci/docker/common/install_cpython.sh b/.ci/docker/common/install_cpython.sh index c160e5704ba3..692edd0b898f 100755 --- a/.ci/docker/common/install_cpython.sh +++ b/.ci/docker/common/install_cpython.sh @@ -83,9 +83,9 @@ function build_cpython { py_suffix=${py_ver::-1} py_folder=$py_suffix fi - # Only b3 is available now + # Update to rc2 due to https://github.com/python/cpython/commit/c72699086fe4 if [ "$py_suffix" == "3.14.0" ]; then - py_suffix="3.14.0b3" + py_suffix="3.14.0rc2" fi wget -q $PYTHON_DOWNLOAD_URL/$py_folder/Python-$py_suffix.tgz -O Python-$py_ver.tgz do_cpython_build $py_ver Python-$py_suffix diff --git a/.ci/docker/common/install_cuda.sh b/.ci/docker/common/install_cuda.sh index 00c3cfd06b41..c6808ea4a7a2 100644 --- a/.ci/docker/common/install_cuda.sh +++ b/.ci/docker/common/install_cuda.sh @@ -147,7 +147,7 @@ function install_128 { } function install_130 { - CUDNN_VERSION=9.12.0.46 + CUDNN_VERSION=9.13.0.50 echo "Installing CUDA 13.0 and cuDNN ${CUDNN_VERSION} and NVSHMEM and NCCL and cuSparseLt-0.7.1" # install CUDA 13.0 in the same container install_cuda 13.0.0 cuda_13.0.0_580.65.06_linux diff --git a/.ci/docker/common/install_triton.sh b/.ci/docker/common/install_triton.sh index f48140952c3a..8e714bcb6cd3 100755 --- a/.ci/docker/common/install_triton.sh +++ b/.ci/docker/common/install_triton.sh @@ -21,7 +21,7 @@ elif [ -n "${TRITON_CPU}" ]; then TRITON_REPO="https://github.com/triton-lang/triton-cpu" TRITON_TEXT_FILE="triton-cpu" else - TRITON_REPO="https://github.com/triton-lang/triton" + TRITON_REPO="https://github.com/ROCm/triton" TRITON_TEXT_FILE="triton" fi diff --git a/.ci/docker/common/install_ucc.sh b/.ci/docker/common/install_ucc.sh index b7f884ea9648..04f15a52e88e 100755 --- a/.ci/docker/common/install_ucc.sh +++ b/.ci/docker/common/install_ucc.sh @@ -44,8 +44,12 @@ function install_ucc() { ./autogen.sh - # We only run distributed tests on Tesla M60 and A10G - NVCC_GENCODE="-gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_86,code=compute_86" + if [[ -n "$CUDA_VERSION" && 
$CUDA_VERSION == 13* ]]; then + NVCC_GENCODE="-gencode=arch=compute_86,code=compute_86" + else + # We only run distributed tests on Tesla M60 and A10G + NVCC_GENCODE="-gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_86,code=compute_86" + fi if [[ -n "$ROCM_VERSION" ]]; then if [[ -n "$PYTORCH_ROCM_ARCH" ]]; then diff --git a/.ci/docker/common/install_xpu.sh b/.ci/docker/common/install_xpu.sh index 7f21d2e42c72..0b150872f93c 100644 --- a/.ci/docker/common/install_xpu.sh +++ b/.ci/docker/common/install_xpu.sh @@ -65,10 +65,14 @@ function install_ubuntu() { function install_rhel() { . /etc/os-release - - if [[ ! " 8.8 8.10 9.0 9.2 9.3 " =~ " ${VERSION_ID} " ]]; then - echo "RHEL version ${VERSION_ID} not supported" - exit + if [[ "${ID}" == "rhel" ]]; then + if [[ ! " 8.8 8.9 9.0 9.2 9.3 " =~ " ${VERSION_ID} " ]]; then + echo "RHEL version ${VERSION_ID} not supported" + exit + fi + elif [[ "${ID}" == "almalinux" ]]; then + # Workaround for almalinux8 which used by quay.io/pypa/manylinux_2_28_x86_64 + VERSION_ID="8.8" fi dnf install -y 'dnf-command(config-manager)' @@ -146,11 +150,11 @@ if [[ "${XPU_DRIVER_TYPE,,}" == "lts" ]]; then XPU_DRIVER_VERSION="/lts/2350" fi -# Default use Intel® oneAPI Deep Learning Essentials 2025.0 -if [[ "$XPU_VERSION" == "2025.1" ]]; then - XPU_PACKAGES="intel-deep-learning-essentials-2025.1" +# Default use Intel® oneAPI Deep Learning Essentials 2025.1 +if [[ "$XPU_VERSION" == "2025.2" ]]; then + XPU_PACKAGES="intel-deep-learning-essentials-2025.2" else - XPU_PACKAGES="intel-deep-learning-essentials-2025.0" + XPU_PACKAGES="intel-deep-learning-essentials-2025.1" fi # The installation depends on the base OS diff --git a/.ci/docker/common/patch_libstdc.sh b/.ci/docker/common/patch_libstdc.sh new file mode 100755 index 000000000000..7e3a00d0dad8 --- /dev/null +++ b/.ci/docker/common/patch_libstdc.sh @@ -0,0 +1,9 @@ +#!/bin/bash +set -xe +# Script used in Linux x86 and aarch64 CD pipeline + +# Workaround for exposing statically linked libstdc++ CXX11 ABI symbols. 
+# see: https://github.com/pytorch/pytorch/issues/133437 +LIBNONSHARED=$(gcc -print-file-name=libstdc++_nonshared.a) +nm -g $LIBNONSHARED | grep " T " | grep recursive_directory_iterator | cut -c 20- > weaken-symbols.txt +objcopy --weaken-symbols weaken-symbols.txt $LIBNONSHARED $LIBNONSHARED diff --git a/.ci/docker/libtorch/Dockerfile b/.ci/docker/libtorch/Dockerfile index d2788b2713f7..d19431ad8b54 100644 --- a/.ci/docker/libtorch/Dockerfile +++ b/.ci/docker/libtorch/Dockerfile @@ -74,6 +74,14 @@ RUN bash ./install_cuda.sh 13.0 RUN bash ./install_magma.sh 13.0 RUN ln -sf /usr/local/cuda-13.0 /usr/local/cuda +# Install libibverbs for libtorch and copy to CUDA directory +RUN apt-get update -y && \ + apt-get install -y libibverbs-dev librdmacm-dev && \ + cp /usr/lib/x86_64-linux-gnu/libmlx5.so* /usr/local/cuda/lib64/ && \ + cp /usr/lib/x86_64-linux-gnu/librdmacm.so* /usr/local/cuda/lib64/ && \ + cp /usr/lib/x86_64-linux-gnu/libibverbs.so* /usr/local/cuda/lib64/ && \ + cp /usr/lib/x86_64-linux-gnu/libnl* /usr/local/cuda/lib64/ + FROM cpu as rocm ARG ROCM_VERSION ARG PYTORCH_ROCM_ARCH diff --git a/.ci/docker/manywheel/Dockerfile_2_28 b/.ci/docker/manywheel/Dockerfile_2_28 index b150423e9954..4803cb778c90 100644 --- a/.ci/docker/manywheel/Dockerfile_2_28 +++ b/.ci/docker/manywheel/Dockerfile_2_28 @@ -130,7 +130,8 @@ ENV LD_LIBRARY_PATH=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/lib64:/op RUN for cpython_version in "cp312-cp312" "cp313-cp313" "cp313-cp313t"; do \ /opt/python/${cpython_version}/bin/python -m pip install setuptools wheel; \ done; - +ADD ./common/patch_libstdc.sh patch_libstdc.sh +RUN bash ./patch_libstdc.sh && rm patch_libstdc.sh # cmake-3.18.4 from pip; force in case cmake3 already exists RUN yum install -y python3-pip && \ @@ -175,6 +176,6 @@ ENV XPU_DRIVER_TYPE ROLLING RUN python3 -m pip install --upgrade pip && \ python3 -mpip install cmake==3.28.4 ADD ./common/install_xpu.sh install_xpu.sh -ENV XPU_VERSION 2025.1 +ENV XPU_VERSION 2025.2 RUN bash ./install_xpu.sh && rm install_xpu.sh RUN pushd /opt/_internal && tar -xJf static-libs-for-embedding-only.tar.xz && popd diff --git a/.ci/docker/manywheel/Dockerfile_2_28_aarch64 b/.ci/docker/manywheel/Dockerfile_2_28_aarch64 index da7ab4d3fd15..6cfab77941fc 100644 --- a/.ci/docker/manywheel/Dockerfile_2_28_aarch64 +++ b/.ci/docker/manywheel/Dockerfile_2_28_aarch64 @@ -71,3 +71,5 @@ RUN rm -rf /opt/python/cp33-cp33m /opt/_internal/cpython-3.3.6 RUN rm -rf /opt/python/cp34-cp34m /opt/_internal/cpython-3.4.6 COPY --from=openblas /opt/OpenBLAS/ /opt/OpenBLAS/ ENV LD_LIBRARY_PATH=/opt/OpenBLAS/lib:$LD_LIBRARY_PATH +ADD ./common/patch_libstdc.sh patch_libstdc.sh +RUN bash ./patch_libstdc.sh && rm patch_libstdc.sh diff --git a/.ci/docker/manywheel/Dockerfile_cuda_aarch64 b/.ci/docker/manywheel/Dockerfile_cuda_aarch64 index 369706055737..4d2596fea821 100644 --- a/.ci/docker/manywheel/Dockerfile_cuda_aarch64 +++ b/.ci/docker/manywheel/Dockerfile_cuda_aarch64 @@ -95,3 +95,5 @@ COPY --from=nvpl /opt/nvpl/lib/ /usr/local/lib/ COPY --from=nvpl /opt/nvpl/include/ /usr/local/include/ RUN ln -sf /usr/local/cuda-${BASE_CUDA_VERSION} /usr/local/cuda ENV PATH=/usr/local/cuda/bin:$PATH +ADD ./common/patch_libstdc.sh patch_libstdc.sh +RUN bash ./patch_libstdc.sh && rm patch_libstdc.sh diff --git a/.ci/docker/requirements-ci.txt b/.ci/docker/requirements-ci.txt index c9d2fddb1324..248ee8409036 100644 --- a/.ci/docker/requirements-ci.txt +++ b/.ci/docker/requirements-ci.txt @@ -93,8 +93,9 @@ librosa==0.10.2 ; python_version == "3.12" and 
platform_machine != "s390x" #Pinned versions: #test that import: -mypy==1.16.0 +mypy==1.16.0 ; platform_system != "Windows" # Pin MyPy version because new errors are likely to appear with each release +# Skip on Windows as lots of type annotations are POSIX specific #Description: linter #Pinned versions: 1.16.0 #test that import: test_typing.py, test_type_hints.py @@ -112,9 +113,8 @@ ninja==1.11.1.3 #test that import: run_test.py, test_cpp_extensions_aot.py,test_determination.py numba==0.49.0 ; python_version < "3.9" and platform_machine != "s390x" -numba==0.55.2 ; python_version == "3.9" and platform_machine != "s390x" -numba==0.55.2 ; python_version == "3.10" and platform_machine != "s390x" -numba==0.60.0 ; python_version == "3.12" and platform_machine != "s390x" +numba==0.60.0 ; python_version == "3.9" and platform_machine != "s390x" +numba==0.61.2 ; python_version > "3.9" and platform_machine != "s390x" #Description: Just-In-Time Compiler for Numerical Functions #Pinned versions: 0.54.1, 0.49.0, <=0.49.1 #test that import: test_numba_integration.py @@ -133,12 +133,10 @@ numba==0.60.0 ; python_version == "3.12" and platform_machine != "s390x" #test_nn.py, test_namedtensor.py, test_linalg.py, test_jit_cuda_fuser.py, #test_jit.py, test_indexing.py, test_datapipe.py, test_dataloader.py, #test_binary_ufuncs.py -numpy==1.22.4; python_version == "3.9" or python_version == "3.10" -numpy==1.26.2; python_version == "3.11" or python_version == "3.12" -numpy==2.1.2; python_version >= "3.13" +numpy==2.0.2 ; python_version == "3.9" +numpy==2.1.2 ; python_version > "3.9" -pandas==2.0.3; python_version < "3.13" -pandas==2.2.3; python_version >= "3.13" +pandas==2.2.3 #onnxruntime #Description: scoring engine for Open Neural Network Exchange (ONNX) models @@ -168,10 +166,11 @@ pillow==11.0.0 #Pinned versions: 10.3.0 #test that import: -protobuf==5.29.4 -#Description: Google's data interchange format -#Pinned versions: 5.29.4 -#test that import: test_tensorboard.py, test/onnx/* +protobuf==3.20.2 ; python_version <= "3.12" +protobuf==4.25.1 ; python_version == "3.13" +#Description: Google’s data interchange format +#Pinned versions: 3.20.1 +#test that import: test_tensorboard.py psutil #Description: information on running processes and system utilization @@ -249,8 +248,8 @@ scikit-image==0.22.0 ; python_version >= "3.10" #Pinned versions: 0.20.3 #test that import: -scipy==1.10.1 ; python_version <= "3.11" -scipy==1.14.1 ; python_version >= "3.12" +scipy==1.13.1 ; python_version == "3.9" +scipy==1.14.1 ; python_version > "3.9" # Pin SciPy because of failing distribution tests (see #60347) #Description: scientific python #Pinned versions: 1.10.1 @@ -309,8 +308,7 @@ z3-solver==4.15.1.0 ; platform_machine != "s390x" #Pinned versions: #test that import: -tensorboard==2.13.0 ; python_version < "3.13" -tensorboard==2.18.0 ; python_version >= "3.13" +tensorboard==2.18.0 #Description: Also included in .ci/docker/requirements-docs.txt #Pinned versions: #test that import: test_tensorboard @@ -322,7 +320,8 @@ pywavelets==1.7.0 ; python_version >= "3.12" #Pinned versions: 1.4.1 #test that import: -lxml==5.3.0 +lxml==5.3.0 ; python_version <= "3.12" +lxml==6.0.0 ; python_version == "3.13" #Description: This is a requirement of unittest-xml-reporting # Python-3.9 binaries @@ -334,8 +333,9 @@ sympy==1.13.3 #Pinned versions: #test that import: -onnx==1.18.0 -#Description: Required by onnx tests, and mypy and test_public_bindings.py when checking torch.onnx._internal +onnx==1.16.1 ; python_version <= "3.12" +onnx==1.18.0 ; 
python_version == "3.13" +#Description: Required by mypy and test_public_bindings.py when checking torch.onnx._internal #Pinned versions: #test that import: @@ -379,7 +379,7 @@ dataclasses_json==0.6.7 cmake==4.0.0 #Description: required for building -tlparse==0.3.30 +tlparse==0.4.0 #Description: required for log parsing cuda-bindings>=12.0,<13.0 ; platform_machine != "s390x" diff --git a/.ci/docker/requirements-docs.txt b/.ci/docker/requirements-docs.txt index 3de4d8e0e44e..c5ad8e969fb9 100644 --- a/.ci/docker/requirements-docs.txt +++ b/.ci/docker/requirements-docs.txt @@ -1,7 +1,7 @@ sphinx==5.3.0 #Description: This is used to generate PyTorch docs #Pinned versions: 5.3.0 --e git+https://github.com/pytorch/pytorch_sphinx_theme.git@722b7e6f9ca512fcc526ad07d62b3d28c50bb6cd#egg=pytorch_sphinx_theme2 +-e git+https://github.com/pytorch/pytorch_sphinx_theme.git@71e55749be14ceb56e7f8211a9fb649866b87ad4#egg=pytorch_sphinx_theme2 # TODO: sphinxcontrib.katex 0.9.0 adds a local KaTeX server to speed up pre-rendering # but it doesn't seem to work and hangs around idly. The initial thought that it is probably diff --git a/.ci/docker/triton_version.txt b/.ci/docker/triton_version.txt index 18091983f59d..1545d966571d 100644 --- a/.ci/docker/triton_version.txt +++ b/.ci/docker/triton_version.txt @@ -1 +1 @@ -3.4.0 +3.5.0 diff --git a/.ci/docker/triton_xpu_version.txt b/.ci/docker/triton_xpu_version.txt index 18091983f59d..1545d966571d 100644 --- a/.ci/docker/triton_xpu_version.txt +++ b/.ci/docker/triton_xpu_version.txt @@ -1 +1 @@ -3.4.0 +3.5.0 diff --git a/.ci/docker/ubuntu/Dockerfile b/.ci/docker/ubuntu/Dockerfile index 57f997f30089..1edc8c60c2f0 100644 --- a/.ci/docker/ubuntu/Dockerfile +++ b/.ci/docker/ubuntu/Dockerfile @@ -66,6 +66,7 @@ ENV NCCL_LIB_DIR="/usr/local/cuda/lib64/" # (optional) Install UCC ARG UCX_COMMIT ARG UCC_COMMIT +ARG CUDA_VERSION ENV UCX_COMMIT $UCX_COMMIT ENV UCC_COMMIT $UCC_COMMIT ENV UCX_HOME /usr diff --git a/.ci/libtorch/build.sh b/.ci/libtorch/build.sh index 54ddd905aad0..c2d67f8b1bb2 100644 --- a/.ci/libtorch/build.sh +++ b/.ci/libtorch/build.sh @@ -7,4 +7,4 @@ set -ex SCRIPTPATH="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" -USE_NVSHMEM=0 USE_CUSPARSELT=0 BUILD_PYTHONLESS=1 DESIRED_PYTHON="3.9" ${SCRIPTPATH}/../manywheel/build.sh +USE_NVSHMEM=0 USE_CUSPARSELT=0 BUILD_PYTHONLESS=1 DESIRED_PYTHON="3.10" ${SCRIPTPATH}/../manywheel/build.sh diff --git a/.ci/lumen_cli/cli/lib/common/gh_summary.py b/.ci/lumen_cli/cli/lib/common/gh_summary.py new file mode 100644 index 000000000000..72bfaa76e706 --- /dev/null +++ b/.ci/lumen_cli/cli/lib/common/gh_summary.py @@ -0,0 +1,143 @@ +from __future__ import annotations + +import logging +import os +import textwrap +from pathlib import Path +from typing import TYPE_CHECKING + +from cli.lib.common.utils import get_wheels +from jinja2 import Template + + +if TYPE_CHECKING: + from collections.abc import Iterable, Mapping + + +logger = logging.getLogger(__name__) + +_TPL_CONTENT = Template( + textwrap.dedent("""\ + ## {{ title }} + + ```{{ lang }} + {{ content }} + ``` +""") +) + +_TPL_LIST_ITEMS = Template( + textwrap.dedent("""\ + ## {{ title }} + {% for it in items %} + - {{ it.pkg }}: {{ it.relpath }} + {% else %} + _(no item found)_ + {% endfor %} + """) +) + +_TPL_TABLE = Template( + textwrap.dedent("""\ + {%- if rows %} + | {{ cols | join(' | ') }} | + |{%- for _ in cols %} --- |{%- endfor %} + {%- for r in rows %} + | {%- for c in cols %} {{ r.get(c, "") }} |{%- endfor %} + {%- endfor %} + {%- else %} + _(no 
data)_ + {%- endif %} +""") +) + + +def gh_summary_path() -> Path | None: + """Return the Path to the GitHub step summary file, or None if not set.""" + p = os.environ.get("GITHUB_STEP_SUMMARY") + return Path(p) if p else None + + +def write_gh_step_summary(md: str, *, append_content: bool = True) -> bool: + """ + Write Markdown content to the GitHub Step Summary file if GITHUB_STEP_SUMMARY is set. + append_content: default true, if True, append to the end of the file, else overwrite the whole file + + Returns: + True if written successfully (in GitHub Actions environment), + False if skipped (e.g., running locally where the variable is not set). + """ + sp = gh_summary_path() + if not sp: + logger.info("[gh-summary] GITHUB_STEP_SUMMARY not set, skipping write.") + return False + + md_clean = textwrap.dedent(md).strip() + "\n" + + mode = "a" if append_content else "w" + with sp.open(mode, encoding="utf-8") as f: + f.write(md_clean) + return True + + +def md_heading(text: str, level: int = 2) -> str: + """Generate a Markdown heading string with the given level (1-6).""" + return f"{'#' * max(1, min(level, 6))} {text}\n" + + +def md_details(summary: str, content: str) -> str: + """Generate a collapsible
<details> block with a summary and inner content.""" + return f"<details>
\n<summary>{summary}</summary>\n\n{content}\n\n</details>
\n" + + +def summarize_content_from_file( + output_dir: Path, + freeze_file: str, + title: str = "Content from file", + code_lang: str = "", # e.g. "text" or "ini" +) -> bool: + f = Path(output_dir) / freeze_file + if not f.exists(): + return False + content = f.read_text(encoding="utf-8").strip() + md = render_content(content, title=title, lang=code_lang) + return write_gh_step_summary(md) + + +def summarize_wheels(path: Path, title: str = "Wheels", max_depth: int = 3): + items = get_wheels(path, max_depth=max_depth) + if not items: + return False + md = render_list(items, title=title) + return write_gh_step_summary(md) + + +def md_kv_table(rows: Iterable[Mapping[str, str | int | float]]) -> str: + """ + Render a list of dicts as a Markdown table using Jinja template. + """ + rows = list(rows) + cols = list({k for r in rows for k in r.keys()}) + md = _TPL_TABLE.render(cols=cols, rows=rows).strip() + "\n" + return md + + +def render_list( + items: Iterable[str], + *, + title: str = "List", +) -> str: + tpl = _TPL_LIST_ITEMS + md = tpl.render(title=title, items=items) + return md + + +def render_content( + content: str, + *, + title: str = "Content", + lang: str = "text", +) -> str: + tpl = _TPL_CONTENT + md = tpl.render(title=title, content=content, lang=lang) + return md diff --git a/.ci/lumen_cli/cli/lib/common/git_helper.py b/.ci/lumen_cli/cli/lib/common/git_helper.py index 7fa070a3cb65..9833caca956c 100644 --- a/.ci/lumen_cli/cli/lib/common/git_helper.py +++ b/.ci/lumen_cli/cli/lib/common/git_helper.py @@ -45,7 +45,7 @@ def clone_external_repo(target: str, repo: str, dst: str = "", update_submodules # Checkout pinned commit commit = get_post_build_pinned_commit(target) - logger.info("Checking out pinned commit %s", commit) + logger.info("Checking out pinned %s commit %s", target, commit) r.git.checkout(commit) # Update submodules if requested @@ -55,7 +55,7 @@ def clone_external_repo(target: str, repo: str, dst: str = "", update_submodules sm.update(init=True, recursive=True, progress=PrintProgress()) logger.info("Successfully cloned %s", target) - return r + return r, commit except GitCommandError as e: logger.error("Git operation failed: %s", e) diff --git a/.ci/lumen_cli/cli/lib/common/pip_helper.py b/.ci/lumen_cli/cli/lib/common/pip_helper.py index 1eed8406c9f7..a53747e24d25 100644 --- a/.ci/lumen_cli/cli/lib/common/pip_helper.py +++ b/.ci/lumen_cli/cli/lib/common/pip_helper.py @@ -4,7 +4,7 @@ import shutil import sys from collections.abc import Iterable -from importlib.metadata import PackageNotFoundError, version +from importlib.metadata import PackageNotFoundError, version # noqa: UP035 from typing import Optional, Union from cli.lib.common.utils import run_command diff --git a/.ci/lumen_cli/cli/lib/common/utils.py b/.ci/lumen_cli/cli/lib/common/utils.py index 05790bd66acf..b03309810d98 100644 --- a/.ci/lumen_cli/cli/lib/common/utils.py +++ b/.ci/lumen_cli/cli/lib/common/utils.py @@ -8,6 +8,7 @@ import subprocess import sys from contextlib import contextmanager +from pathlib import Path from typing import Optional @@ -115,3 +116,24 @@ def working_directory(path: str): yield finally: os.chdir(prev_cwd) + + +def get_wheels( + output_dir: Path, + max_depth: Optional[int] = None, +) -> list[str]: + """Return a list of wheels found in the given output directory.""" + root = Path(output_dir) + if not root.exists(): + return [] + items = [] + for dirpath, _, filenames in os.walk(root): + depth = Path(dirpath).relative_to(root).parts + if max_depth is not None and len(depth) > 
max_depth: + continue + for fname in sorted(filenames): + if fname.endswith(".whl"): + pkg = fname.split("-")[0] + relpath = str((Path(dirpath) / fname).relative_to(root)) + items.append({"pkg": pkg, "relpath": relpath}) + return items diff --git a/.ci/lumen_cli/cli/lib/core/vllm/lib.py b/.ci/lumen_cli/cli/lib/core/vllm/lib.py index 7f3a930b2cc6..0e2132839adb 100644 --- a/.ci/lumen_cli/cli/lib/core/vllm/lib.py +++ b/.ci/lumen_cli/cli/lib/core/vllm/lib.py @@ -1,13 +1,27 @@ import logging +import os +import textwrap from typing import Any +from cli.lib.common.gh_summary import write_gh_step_summary from cli.lib.common.git_helper import clone_external_repo from cli.lib.common.pip_helper import pip_install_packages from cli.lib.common.utils import run_command, temp_environ, working_directory +from jinja2 import Template logger = logging.getLogger(__name__) +_TPL_VLLM_INFO = Template( + textwrap.dedent("""\ + ## Vllm against Pytorch CI Test Summary + **Vllm Commit**: [{{ vllm_commit }}](https://github.com/vllm-project/vllm/commit/{{ vllm_commit }}) + {%- if torch_sha %} + **Pytorch Commit**: [{{ torch_sha }}](https://github.com/pytorch/pytorch/commit/{{ torch_sha }}) + {%- endif %} +""") +) + def sample_vllm_test_library(): """ @@ -27,7 +41,6 @@ def sample_vllm_test_library(): "pytest -v -s basic_correctness/test_cumem.py", "pytest -v -s basic_correctness/test_basic_correctness.py", "pytest -v -s basic_correctness/test_cpu_offload.py", - "VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py", ], }, "vllm_basic_models_test": { @@ -54,16 +67,12 @@ def sample_vllm_test_library(): "-v", "-s", "entrypoints/llm", - "--ignore=entrypoints/llm/test_lazy_outlines.py", "--ignore=entrypoints/llm/test_generate.py", - "--ignore=entrypoints/llm/test_generate_multiple_loras.py", "--ignore=entrypoints/llm/test_collective_rpc.py", ] ), - "pytest -v -s entrypoints/llm/test_lazy_outlines.py", - "pytest -v -s entrypoints/llm/test_generate.py ", - "pytest -v -s entrypoints/llm/test_generate_multiple_loras.py", - "VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode", + "pytest -v -s entrypoints/llm/test_generate.py", + "pytest -v -s entrypoints/offline_mode", ], }, "vllm_regression_test": { @@ -83,14 +92,24 @@ def sample_vllm_test_library(): "num_gpus": 4, "steps": [ "pytest -v -s -x lora/test_chatglm3_tp.py", - "echo $VLLM_WORKER_MULTIPROC_METHOD", "pytest -v -s -x lora/test_llama_tp.py", - "pytest -v -s -x lora/test_multi_loras_with_tp.py", + "pytest -v -s -x lora/test_llm_with_multi_loras.py", ], }, - "vllm_lora_280_failure_test": { - "title": "LoRA 280 failure test", - "id": "vllm_lora_280_failure_test", + "vllm_distributed_test_28_failure_test": { + "title": "Distributed Tests (2 GPUs) pytorch 2.8 release failure", + "id": "vllm_distributed_test_28_failure_test", + "env_vars": { + "VLLM_WORKER_MULTIPROC_METHOD": "spawn", + }, + "num_gpus": 4, + "steps": [ + "pytest -v -s distributed/test_sequence_parallel.py", + ], + }, + "vllm_lora_28_failure_test": { + "title": "LoRA pytorch 2.8 failure test", + "id": "vllm_lora_28_failure_test", "steps": ["pytest -v lora/test_quant_model.py"], }, "vllm_multi_model_processor_test": { @@ -101,6 +120,15 @@ def sample_vllm_test_library(): "pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py", ], }, + "vllm_multi_model_test_28_failure_test": { + "title": "Multi-Model Test (Failed 2.8 release)", + "id": "vllm_multi_model_test_28_failure_test", + "package_install": 
["git+https://github.com/TIGER-AI-Lab/Mantis.git"], + "steps": [ + "pytest -v -s models/multimodal/generation/test_voxtral.py", + "pytest -v -s models/multimodal/pooling", + ], + }, "vllm_pytorch_compilation_unit_tests": { "title": "PyTorch Compilation Unit Tests", "id": "vllm_pytorch_compilation_unit_tests", @@ -115,6 +143,28 @@ def sample_vllm_test_library(): "pytest -v -s compile/test_decorator.py", ], }, + "vllm_languagde_model_test_extended_generation_28_failure_test": { + "title": "Language Models Test (Extended Generation) 2.8 release failure", + "id": "vllm_languagde_model_test_extended_generation_28_failure_test", + "package_install": [ + "--no-build-isolation", + "git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8", + ], + "steps": [ + "pytest -v -s models/language/generation/test_mistral.py", + ], + }, + "vllm_distributed_test_2_gpu_28_failure_test": { + "title": "Distributed Tests (2 GPUs) pytorch 2.8 release failure", + "id": "vllm_distributed_test_2_gpu_28_failure_test", + "env_vars": { + "VLLM_WORKER_MULTIPROC_METHOD": "spawn", + }, + "num_gpus": 4, + "steps": [ + "pytest -v -s distributed/test_sequence_parallel.py", + ], + }, # TODO(elainewy):need to add g6 with 4 gpus to run this test "vllm_lora_test": { "title": "LoRA Test %N", @@ -214,12 +264,13 @@ def run_test_plan( def clone_vllm(dst: str = "vllm"): - clone_external_repo( + _, commit = clone_external_repo( target="vllm", repo="https://github.com/vllm-project/vllm.git", dst=dst, update_submodules=True, ) + return commit def replace_buildkite_placeholders(step: str, shard_id: int, num_shards: int) -> str: @@ -230,3 +281,12 @@ def replace_buildkite_placeholders(step: str, shard_id: int, num_shards: int) -> for k in sorted(mapping, key=len, reverse=True): step = step.replace(k, mapping[k]) return step + + +def summarize_build_info(vllm_commit: str) -> bool: + torch_sha = os.getenv("GITHUB_SHA") + md = ( + _TPL_VLLM_INFO.render(vllm_commit=vllm_commit, torch_sha=torch_sha).strip() + + "\n" + ) + return write_gh_step_summary(md) diff --git a/.ci/lumen_cli/cli/lib/core/vllm/vllm_build.py b/.ci/lumen_cli/cli/lib/core/vllm/vllm_build.py index d067a14f7590..8db48065cb05 100644 --- a/.ci/lumen_cli/cli/lib/core/vllm/vllm_build.py +++ b/.ci/lumen_cli/cli/lib/core/vllm/vllm_build.py @@ -13,6 +13,11 @@ env_str_field, with_params_help, ) +from cli.lib.common.gh_summary import ( + gh_summary_path, + summarize_content_from_file, + summarize_wheels, +) from cli.lib.common.path_helper import ( copy, ensure_dir_exists, @@ -21,7 +26,7 @@ is_path_exist, ) from cli.lib.common.utils import run_command -from cli.lib.core.vllm.lib import clone_vllm +from cli.lib.core.vllm.lib import clone_vllm, summarize_build_info logger = logging.getLogger(__name__) @@ -153,18 +158,43 @@ def run(self): """ inputs = VllmBuildParameters() logger.info("Running vllm build with inputs: %s", inputs) - clone_vllm() + vllm_commit = clone_vllm() self.cp_dockerfile_if_exist(inputs) - # cp torch wheels from root direct to vllm workspace if exist self.cp_torch_whls_if_exist(inputs) - ensure_dir_exists(inputs.output_dir) + # make sure the output dir to store the build artifacts exist + ensure_dir_exists(Path(inputs.output_dir)) cmd = self._generate_docker_build_cmd(inputs) logger.info("Running docker build: \n %s", cmd) - run_command(cmd, cwd="vllm", env=os.environ.copy()) + + try: + run_command(cmd, cwd="vllm", env=os.environ.copy()) + finally: + self.genearte_vllm_build_summary(vllm_commit, inputs) + + def genearte_vllm_build_summary( + self, vllm_commit: str, 
inputs: VllmBuildParameters + ): + if not gh_summary_path(): + return logger.info("Skipping, not detect GH Summary env var....") + logger.info("Generate GH Summary ...") + # summarize vllm build info + summarize_build_info(vllm_commit) + + # summarize vllm build artifacts + vllm_artifact_dir = inputs.output_dir / "wheels" + summarize_content_from_file( + vllm_artifact_dir, + "build_summary.txt", + title="Vllm build env pip package summary", + ) + summarize_wheels( + inputs.torch_whls_path, max_depth=3, title="Torch Wheels Artifacts" + ) + summarize_wheels(vllm_artifact_dir, max_depth=3, title="Vllm Wheels Artifacts") def cp_torch_whls_if_exist(self, inputs: VllmBuildParameters) -> str: if not inputs.use_torch_whl: diff --git a/.ci/lumen_cli/cli/lib/core/vllm/vllm_test.py b/.ci/lumen_cli/cli/lib/core/vllm/vllm_test.py index 2be8e246486e..76401e33f29f 100644 --- a/.ci/lumen_cli/cli/lib/core/vllm/vllm_test.py +++ b/.ci/lumen_cli/cli/lib/core/vllm/vllm_test.py @@ -104,20 +104,26 @@ def run(self): main function to run vllm test """ self.prepare() - with working_directory(self.work_directory): - if self.test_type == TestInpuType.TEST_PLAN: - if self.num_shards > 1: - run_test_plan( - self.test_plan, - "vllm", - sample_vllm_test_library(), - self.shard_id, - self.num_shards, - ) + try: + with working_directory(self.work_directory): + if self.test_type == TestInpuType.TEST_PLAN: + if self.num_shards > 1: + run_test_plan( + self.test_plan, + "vllm", + sample_vllm_test_library(), + self.shard_id, + self.num_shards, + ) + else: + run_test_plan( + self.test_plan, "vllm", sample_vllm_test_library() + ) else: - run_test_plan(self.test_plan, "vllm", sample_vllm_test_library()) - else: - raise ValueError(f"Unknown test type {self.test_type}") + raise ValueError(f"Unknown test type {self.test_type}") + finally: + # double check the torches are not overridden by other packages + check_versions() def _install_wheels(self, params: VllmTestParameters): logger.info("Running vllm test with inputs: %s", params) @@ -220,6 +226,8 @@ def preprocess_test_in( target_path = Path(target_file) lines = target_path.read_text().splitlines() + pkgs_to_add = [] + # Remove lines starting with the package names (==, @, >=) — case-insensitive pattern = re.compile(rf"^({'|'.join(pkgs_to_remove)})\s*(==|@|>=)", re.IGNORECASE) kept_lines = [line for line in lines if not pattern.match(line)] @@ -236,7 +244,11 @@ def preprocess_test_in( ] # Write back: header_lines + blank + kept_lines - out = "\n".join(header_lines + [""] + kept_lines) + "\n" + out_lines = header_lines + [""] + kept_lines + if pkgs_to_add: + out_lines += [""] + pkgs_to_add + + out = "\n".join(out_lines) + "\n" target_path.write_text(out) logger.info("[INFO] Updated %s", target_file) diff --git a/.ci/manywheel/build_cuda.sh b/.ci/manywheel/build_cuda.sh index 3fbd25be1da3..6ed38f8b25c6 100644 --- a/.ci/manywheel/build_cuda.sh +++ b/.ci/manywheel/build_cuda.sh @@ -124,6 +124,7 @@ if [[ $CUDA_VERSION == 12* || $CUDA_VERSION == 13* ]]; then fi if [[ -z "$PYTORCH_EXTRA_INSTALL_REQUIREMENTS" ]]; then echo "Bundling with cudnn and cublas." 
+ DEPS_LIST+=( "/usr/local/cuda/lib64/libcudnn_adv.so.9" "/usr/local/cuda/lib64/libcudnn_cnn.so.9" @@ -133,16 +134,11 @@ if [[ $CUDA_VERSION == 12* || $CUDA_VERSION == 13* ]]; then "/usr/local/cuda/lib64/libcudnn_engines_precompiled.so.9" "/usr/local/cuda/lib64/libcudnn_heuristic.so.9" "/usr/local/cuda/lib64/libcudnn.so.9" - "/usr/local/cuda/lib64/libcublas.so.12" - "/usr/local/cuda/lib64/libcublasLt.so.12" "/usr/local/cuda/lib64/libcusparseLt.so.0" - "/usr/local/cuda/lib64/libcudart.so.12" - "/usr/local/cuda/lib64/libnvrtc.so.12" "/usr/local/cuda/lib64/libnvrtc-builtins.so" "/usr/local/cuda/lib64/libcufile.so.0" "/usr/local/cuda/lib64/libcufile_rdma.so.1" "/usr/local/cuda/lib64/libnvshmem_host.so.3" - "/usr/local/cuda/extras/CUPTI/lib64/libcupti.so.12" "/usr/local/cuda/extras/CUPTI/lib64/libnvperf_host.so" ) DEPS_SONAME+=( @@ -154,22 +150,56 @@ if [[ $CUDA_VERSION == 12* || $CUDA_VERSION == 13* ]]; then "libcudnn_engines_precompiled.so.9" "libcudnn_heuristic.so.9" "libcudnn.so.9" - "libcublas.so.12" - "libcublasLt.so.12" "libcusparseLt.so.0" - "libcudart.so.12" - "libnvrtc.so.12" "libnvrtc-builtins.so" "libnvshmem_host.so.3" "libcufile.so.0" "libcufile_rdma.so.1" - "libcupti.so.12" "libnvperf_host.so" ) # Add libnvToolsExt only if CUDA version is not 12.9 - if [[ $CUDA_VERSION != 12.9* ]]; then - DEPS_LIST+=("/usr/local/cuda/lib64/libnvToolsExt.so.1") - DEPS_SONAME+=("libnvToolsExt.so.1") + if [[ $CUDA_VERSION == 13* ]]; then + DEPS_LIST+=( + "/usr/local/cuda/lib64/libcublas.so.13" + "/usr/local/cuda/lib64/libcublasLt.so.13" + "/usr/local/cuda/lib64/libcudart.so.13" + "/usr/local/cuda/lib64/libnvrtc.so.13" + "/usr/local/cuda/extras/CUPTI/lib64/libcupti.so.13" + "/usr/local/cuda/lib64/libibverbs.so.1" + "/usr/local/cuda/lib64/librdmacm.so.1" + "/usr/local/cuda/lib64/libmlx5.so.1" + "/usr/local/cuda/lib64/libnl-3.so.200" + "/usr/local/cuda/lib64/libnl-route-3.so.200") + DEPS_SONAME+=( + "libcublas.so.13" + "libcublasLt.so.13" + "libcudart.so.13" + "libnvrtc.so.13" + "libcupti.so.13" + "libibverbs.so.1" + "librdmacm.so.1" + "libmlx5.so.1" + "libnl-3.so.200" + "libnl-route-3.so.200") + export USE_CUPTI_SO=1 + export ATEN_STATIC_CUDA=0 + export USE_CUDA_STATIC_LINK=0 + export USE_CUFILE=0 + else + DEPS_LIST+=( + "/usr/local/cuda/lib64/libnvToolsExt.so.1" + "/usr/local/cuda/lib64/libcublas.so.12" + "/usr/local/cuda/lib64/libcublasLt.so.12" + "/usr/local/cuda/lib64/libcudart.so.12" + "/usr/local/cuda/lib64/libnvrtc.so.12" + "/usr/local/cuda/extras/CUPTI/lib64/libcupti.so.12") + DEPS_SONAME+=( + "libnvToolsExt.so.1" + "libcublas.so.12" + "libcublasLt.so.12" + "libcudart.so.12" + "libnvrtc.so.12" + "libcupti.so.12") fi else echo "Using nvidia libs from pypi." diff --git a/.ci/pytorch/check_binary.sh b/.ci/pytorch/check_binary.sh index 0f632f8006c0..cca289ac146b 100755 --- a/.ci/pytorch/check_binary.sh +++ b/.ci/pytorch/check_binary.sh @@ -67,7 +67,7 @@ fi # wheels with cxx11-abi echo "Checking that the gcc ABI is what we expect" -if [[ "$(uname)" != 'Darwin' ]]; then +if [[ "$(uname)" != 'Darwin' && "$(uname -m)" != "s390x" ]]; then # We also check that there are cxx11 symbols in libtorch # echo "Checking that symbols in libtorch.so have the right gcc abi" diff --git a/.ci/pytorch/common_utils.sh b/.ci/pytorch/common_utils.sh index 6d79a4517edf..bf03e132d30b 100644 --- a/.ci/pytorch/common_utils.sh +++ b/.ci/pytorch/common_utils.sh @@ -284,7 +284,7 @@ function install_torchrec_and_fbgemm() { function clone_pytorch_xla() { if [[ ! 
-d ./xla ]]; then - git clone --recursive --quiet https://github.com/pytorch/xla.git + git clone --recursive -b r2.9 https://github.com/pytorch/xla.git pushd xla # pin the xla hash so that we don't get broken by changes to xla git checkout "$(cat ../.github/ci_commit_pins/xla.txt)" diff --git a/.ci/pytorch/cpp_doc_push_script.sh b/.ci/pytorch/cpp_doc_push_script.sh index 6e417bf8bbe9..f085fa78bebe 100755 --- a/.ci/pytorch/cpp_doc_push_script.sh +++ b/.ci/pytorch/cpp_doc_push_script.sh @@ -58,7 +58,7 @@ time python tools/setup_helpers/generate_code.py \ # Build the docs pushd docs/cpp -time make VERBOSE=1 html -j +time make VERBOSE=1 html popd popd diff --git a/.ci/pytorch/macos-test.sh b/.ci/pytorch/macos-test.sh index 295a82f057dc..a859901191e0 100755 --- a/.ci/pytorch/macos-test.sh +++ b/.ci/pytorch/macos-test.sh @@ -195,7 +195,7 @@ torchbench_setup_macos() { git checkout "$(cat ../.github/ci_commit_pins/vision.txt)" git submodule update --init --recursive python setup.py clean - python setup.py develop + python -m pip install -e . -v --no-build-isolation popd pushd torchaudio @@ -204,7 +204,7 @@ torchbench_setup_macos() { git submodule update --init --recursive python setup.py clean #TODO: Remove me, when figure out how to make TorchAudio find brew installed openmp - USE_OPENMP=0 python setup.py develop + USE_OPENMP=0 python -m pip install -e . -v --no-build-isolation popd checkout_install_torchbench @@ -302,6 +302,47 @@ test_torchbench_smoketest() { fi done + echo "Pytorch benchmark on mps device completed" +} + +test_aoti_torchbench_smoketest() { + print_cmake_info + + echo "Launching AOTInductor torchbench setup" + pip_benchmark_deps + # shellcheck disable=SC2119,SC2120 + torchbench_setup_macos + + TEST_REPORTS_DIR=$(pwd)/test/test-reports + mkdir -p "$TEST_REPORTS_DIR" + + local device=mps + local dtypes=(undefined float16 bfloat16 notset) + local dtype=${dtypes[$1]} + local models=(hf_T5 llama BERT_pytorch dcgan hf_GPT2 yolov3 resnet152 sam sam_fast pytorch_unet stable_diffusion_text_encoder speech_transformer Super_SloMo doctr_det_predictor doctr_reco_predictor timm_resnet timm_vovnet vgg16) + + echo "Launching torchbench inference performance run for AOT Inductor and dtype ${dtype}" + local dtype_arg="--${dtype}" + if [ "$dtype" == notset ]; then + dtype_arg="--float32" + fi + touch "$TEST_REPORTS_DIR/aot_inductor_torchbench_${dtype}_inference_${device}_performance.csv" + for model in "${models[@]}"; do + PYTHONPATH="$(pwd)"/torchbench python benchmarks/dynamo/torchbench.py \ + --performance --only "$model" --export-aot-inductor --inference --devices "$device" "$dtype_arg" \ + --output "$TEST_REPORTS_DIR/aot_inductor_torchbench_${dtype}_inference_${device}_performance.csv" || true + PYTHONPATH="$(pwd)"/torchbench python benchmarks/dynamo/torchbench.py \ + --accuracy --only "$model" --export-aot-inductor --inference --devices "$device" "$dtype_arg" \ + --output "$TEST_REPORTS_DIR/aot_inductor_torchbench_${dtype}_inference_${device}_accuracy.csv" || true + done + + echo "Launching HuggingFace inference performance run for AOT Inductor and dtype ${dtype}" + PYTHONPATH="$(pwd)"/torchbench python benchmarks/dynamo/huggingface.py \ + --performance --export-aot-inductor --inference --devices "$device" "$dtype_arg" \ + --output "$TEST_REPORTS_DIR/aot_inductor_huggingface_${dtype}_inference_${device}_performance.csv" || true + PYTHONPATH="$(pwd)"/torchbench python benchmarks/dynamo/huggingface.py \ + --accuracy --export-aot-inductor --inference --devices "$device" "$dtype_arg" \ + 
--output "$TEST_REPORTS_DIR/aot_inductor_huggingface_${dtype}_inference_${device}_accuracy.csv" || true echo "Pytorch benchmark on mps device completed" } @@ -350,6 +391,8 @@ elif [[ $TEST_CONFIG == *"perf_timm"* ]]; then test_timm_perf elif [[ $TEST_CONFIG == *"perf_smoketest"* ]]; then test_torchbench_smoketest "${SHARD_NUMBER}" +elif [[ $TEST_CONFIG == *"aot_inductor_perf_smoketest"* ]]; then + test_aoti_torchbench_smoketest "${SHARD_NUMBER}" elif [[ $TEST_CONFIG == *"mps"* ]]; then test_python_mps elif [[ $NUM_TEST_SHARDS -gt 1 ]]; then diff --git a/.ci/pytorch/numba-cuda-13.patch b/.ci/pytorch/numba-cuda-13.patch new file mode 100644 index 000000000000..f96ff287ed39 --- /dev/null +++ b/.ci/pytorch/numba-cuda-13.patch @@ -0,0 +1,25 @@ +From 6e08c9d08e9de59c7af28b720289debbbd384764 Mon Sep 17 00:00:00 2001 +From: Michael Wang <13521008+isVoid@users.noreply.github.com> +Date: Tue, 1 Apr 2025 17:28:05 -0700 +Subject: [PATCH] Avoid bumping certain driver API to avoid future breakage + (#185) + +Co-authored-by: isVoid +--- + numba_cuda/numba/cuda/cudadrv/driver.py | 3 +++ + 1 file changed, 3 insertions(+) + +diff --git a/numba_cuda/numba/cuda/cudadrv/driver.py b/numba_cuda/numba/cuda/cudadrv/driver.py +index 1641bf77..233e9ed7 100644 +--- a/numba_cuda/numba/cuda/cudadrv/driver.py ++++ b/numba_cuda/numba/cuda/cudadrv/driver.py +@@ -365,6 +365,9 @@ def _find_api(self, fname): + else: + variants = ('_v2', '') + ++ if fname in ("cuCtxGetDevice", "cuCtxSynchronize"): ++ return getattr(self.lib, fname) ++ + for variant in variants: + try: + return getattr(self.lib, f'{fname}{variant}') diff --git a/.ci/pytorch/smoke_test/check_binary_symbols.py b/.ci/pytorch/smoke_test/check_binary_symbols.py index 3e88ffe4ffd7..b0c607659c72 100755 --- a/.ci/pytorch/smoke_test/check_binary_symbols.py +++ b/.ci/pytorch/smoke_test/check_binary_symbols.py @@ -32,6 +32,9 @@ "torch::", ) +# Patterns for detecting statically linked libstdc++ symbols +STATICALLY_LINKED_CXX11_ABI = [re.compile(r".*recursive_directory_iterator.*")] + def _apply_libtorch_symbols(symbols): return [ @@ -53,12 +56,17 @@ def get_symbols(lib: str) -> list[tuple[str, str, str]]: return [x.split(" ", 2) for x in lines.decode("latin1").split("\n")[:-1]] -def grep_symbols(lib: str, patterns: list[Any]) -> list[str]: +def grep_symbols( + lib: str, patterns: list[Any], symbol_type: str | None = None +) -> list[str]: def _grep_symbols( symbols: list[tuple[str, str, str]], patterns: list[Any] ) -> list[str]: rc = [] for _s_addr, _s_type, s_name in symbols: + # Filter by symbol type if specified + if symbol_type and _s_type != symbol_type: + continue for pattern in patterns: if pattern.match(s_name): rc.append(s_name) @@ -80,6 +88,18 @@ def _get_symbols_chunk(i): return functools.reduce(list.__add__, (x.result() for x in tasks), []) +def check_lib_statically_linked_libstdc_cxx_abi_symbols(lib: str) -> None: + cxx11_statically_linked_symbols = grep_symbols( + lib, STATICALLY_LINKED_CXX11_ABI, symbol_type="T" + ) + num_statically_linked_symbols = len(cxx11_statically_linked_symbols) + print(f"num_statically_linked_symbols (T): {num_statically_linked_symbols}") + if num_statically_linked_symbols > 0: + raise RuntimeError( + f"Found statically linked libstdc++ symbols (recursive_directory_iterator): {cxx11_statically_linked_symbols[:100]}" + ) + + def check_lib_symbols_for_abi_correctness(lib: str) -> None: print(f"lib: {lib}") cxx11_symbols = grep_symbols(lib, LIBTORCH_CXX11_PATTERNS) @@ -107,6 +127,7 @@ def main() -> None: libtorch_cpu_path = 
str(install_root / "lib" / "libtorch_cpu.so") check_lib_symbols_for_abi_correctness(libtorch_cpu_path) + check_lib_statically_linked_libstdc_cxx_abi_symbols(libtorch_cpu_path) if __name__ == "__main__": diff --git a/.ci/pytorch/test.sh b/.ci/pytorch/test.sh index a0c3760b5eaa..e8c5b3fc56af 100755 --- a/.ci/pytorch/test.sh +++ b/.ci/pytorch/test.sh @@ -32,6 +32,16 @@ if [[ "$BUILD_ENVIRONMENT" != *rocm* && "$BUILD_ENVIRONMENT" != *s390x* && -d /v git config --global --add safe.directory /var/lib/jenkins/workspace fi + +# Patch numba to avoid CUDA-13 crash, see https://github.com/pytorch/pytorch/issues/162878 +NUMBA_CUDA_DIR=$(python -c "import os;import numba.cuda; print(os.path.dirname(numba.cuda.__file__))" 2>/dev/null || true) +if [ -n "$NUMBA_CUDA_DIR" ]; then + NUMBA_PATCH="$(dirname "$(realpath "${BASH_SOURCE[0]}")")/numba-cuda-13.patch" + pushd "$NUMBA_CUDA_DIR" + patch -p4 <"$NUMBA_PATCH" + popd +fi + echo "Environment variables:" env @@ -496,6 +506,14 @@ test_inductor_cpp_wrapper_shard() { -k 'take' \ --shard "$1" "$NUM_TEST_SHARDS" \ --verbose + + if [[ "${BUILD_ENVIRONMENT}" == *xpu* ]]; then + python test/run_test.py \ + --include inductor/test_mkldnn_pattern_matcher \ + -k 'xpu' \ + --shard "$1" "$NUM_TEST_SHARDS" \ + --verbose + fi } # "Global" flags for inductor benchmarking controlled by TEST_CONFIG @@ -1606,6 +1624,25 @@ test_operator_benchmark() { --expected "expected_ci_operator_benchmark_eager_float32_cpu.csv" } +test_operator_microbenchmark() { + TEST_REPORTS_DIR=$(pwd)/test/test-reports + mkdir -p "$TEST_REPORTS_DIR" + TEST_DIR=$(pwd) + + cd benchmarks/operator_benchmark/pt_extension + python -m pip install . + + cd "${TEST_DIR}"/benchmarks/operator_benchmark + + for OP_BENCHMARK_TESTS in matmul mm addmm bmm; do + $TASKSET python -m pt.${OP_BENCHMARK_TESTS}_test --tag-filter long \ + --output-json-for-dashboard "${TEST_REPORTS_DIR}/operator_microbenchmark_${OP_BENCHMARK_TESTS}_compile.json" \ + --benchmark-name "PyTorch operator microbenchmark" --use-compile + $TASKSET python -m pt.${OP_BENCHMARK_TESTS}_test --tag-filter long \ + --output-json-for-dashboard "${TEST_REPORTS_DIR}/operator_microbenchmark_${OP_BENCHMARK_TESTS}.json" \ + --benchmark-name "PyTorch operator microbenchmark" + done +} if ! 
[[ "${BUILD_ENVIRONMENT}" == *libtorch* || "${BUILD_ENVIRONMENT}" == *-bazel-* ]]; then (cd test && python -c "import torch; print(torch.__config__.show())") @@ -1660,6 +1697,8 @@ elif [[ "${TEST_CONFIG}" == *operator_benchmark* ]]; then test_operator_benchmark cpu ${TEST_MODE} fi +elif [[ "${TEST_CONFIG}" == *operator_microbenchmark* ]]; then + test_operator_microbenchmark elif [[ "${TEST_CONFIG}" == *inductor_distributed* ]]; then test_inductor_distributed elif [[ "${TEST_CONFIG}" == *inductor-halide* ]]; then @@ -1713,11 +1752,6 @@ elif [[ "${TEST_CONFIG}" == *inductor_cpp_wrapper* ]]; then elif [[ "${TEST_CONFIG}" == *inductor* ]]; then install_torchvision test_inductor_shard "${SHARD_NUMBER}" - if [[ "${SHARD_NUMBER}" == 1 ]]; then - if [[ "${BUILD_ENVIRONMENT}" != linux-jammy-py3.9-gcc11-build ]]; then - test_inductor_distributed - fi - fi elif [[ "${TEST_CONFIG}" == *einops* ]]; then test_einops elif [[ "${TEST_CONFIG}" == *dynamo_wrapped* ]]; then diff --git a/.ci/pytorch/win-test-helpers/build_pytorch.bat b/.ci/pytorch/win-test-helpers/build_pytorch.bat index 19d715b9d0b6..67d156922192 100644 --- a/.ci/pytorch/win-test-helpers/build_pytorch.bat +++ b/.ci/pytorch/win-test-helpers/build_pytorch.bat @@ -137,7 +137,7 @@ sccache --show-stats python -c "import os, glob; os.system('python -mpip install --no-index --no-deps ' + glob.glob('dist/*.whl')[0])" ( if "%BUILD_ENVIRONMENT%"=="" ( - echo NOTE: To run `import torch`, please make sure to activate the conda environment by running `call %CONDA_PARENT_DIR%\Miniconda3\Scripts\activate.bat %CONDA_PARENT_DIR%\Miniconda3` in Command Prompt before running Git Bash. + echo NOTE: To run `import torch`, please make sure to activate the conda environment by running `call %CONDA_ROOT_DIR%\Scripts\activate.bat %CONDA_ROOT_DIR%\envs\py_tmp` in Command Prompt before running Git Bash. ) else ( copy /Y "dist\*.whl" "%PYTORCH_FINAL_PACKAGE_DIR%" diff --git a/.ci/pytorch/win-test-helpers/installation-helpers/activate_miniconda3.bat b/.ci/pytorch/win-test-helpers/installation-helpers/activate_miniconda3.bat index 01e08c8bb4e5..abd2c8722b11 100644 --- a/.ci/pytorch/win-test-helpers/installation-helpers/activate_miniconda3.bat +++ b/.ci/pytorch/win-test-helpers/installation-helpers/activate_miniconda3.bat @@ -3,12 +3,12 @@ if "%BUILD_ENVIRONMENT%"=="" ( ) else ( set CONDA_PARENT_DIR=C:\Jenkins ) - +set CONDA_ROOT_DIR=%CONDA_PARENT_DIR%\Miniconda3 :: Be conservative here when rolling out the new AMI with conda. This will try :: to install conda as before if it couldn't find the conda installation. This :: can be removed eventually after we gain enough confidence in the AMI -if not exist %CONDA_PARENT_DIR%\Miniconda3 ( +if not exist %CONDA_ROOT_DIR% ( set INSTALL_FRESH_CONDA=1 ) @@ -17,10 +17,14 @@ if "%INSTALL_FRESH_CONDA%"=="1" ( if errorlevel 1 exit /b if not errorlevel 0 exit /b - %TMP_DIR_WIN%\Miniconda3-latest-Windows-x86_64.exe /InstallationType=JustMe /RegisterPython=0 /S /AddToPath=0 /D=%CONDA_PARENT_DIR%\Miniconda3 + %TMP_DIR_WIN%\Miniconda3-latest-Windows-x86_64.exe /InstallationType=JustMe /RegisterPython=0 /S /AddToPath=0 /D=%CONDA_ROOT_DIR% if errorlevel 1 exit /b if not errorlevel 0 exit /b ) :: Activate conda so that we can use its commands, i.e. conda, python, pip -call %CONDA_PARENT_DIR%\Miniconda3\Scripts\activate.bat %CONDA_PARENT_DIR%\Miniconda3 +call %CONDA_ROOT_DIR%\Scripts\activate.bat %CONDA_ROOT_DIR% +:: Activate conda so that we can use its commands, i.e. 
conda, python, pip +call conda activate py_tmp + +call pip install -r .ci/docker/requirements-ci.txt diff --git a/.ci/pytorch/win-test-helpers/setup_pytorch_env.bat b/.ci/pytorch/win-test-helpers/setup_pytorch_env.bat index 4a464d6b5786..3173582b06f4 100644 --- a/.ci/pytorch/win-test-helpers/setup_pytorch_env.bat +++ b/.ci/pytorch/win-test-helpers/setup_pytorch_env.bat @@ -14,7 +14,7 @@ if not errorlevel 0 exit /b :: build\torch. Rather than changing all these references, making a copy of torch folder :: from conda to the current workspace is easier. The workspace will be cleaned up after :: the job anyway -xcopy /s %CONDA_PARENT_DIR%\Miniconda3\Lib\site-packages\torch %TMP_DIR_WIN%\build\torch\ +xcopy /s %CONDA_ROOT_DIR%\envs\py_tmp\Lib\site-packages\torch %TMP_DIR_WIN%\build\torch\ pushd . if "%VC_VERSION%" == "" ( diff --git a/.ci/pytorch/win-test.sh b/.ci/pytorch/win-test.sh index be7f3e4bb35c..c96d5c331c9f 100755 --- a/.ci/pytorch/win-test.sh +++ b/.ci/pytorch/win-test.sh @@ -38,13 +38,20 @@ if [[ "$BUILD_ENVIRONMENT" == *cuda* ]]; then fi # TODO: Move both of them to Windows AMI -python -m pip install pytest-rerunfailures==10.3 pytest-cpp==2.3.0 tensorboard==2.13.0 protobuf==5.29.4 pytest-subtests==0.13.1 +python -m pip install tensorboard==2.13.0 protobuf==5.29.4 pytest-subtests==0.13.1 + +# Copied from https://github.com/pytorch/test-infra/blob/be01a40157c36cd5a48391fdf44a7bc3ebd4c7e3/aws/ami/windows/scripts/Installers/Install-Pip-Dependencies.ps1#L16 with some adjustments +# pytest-rerunfailures==10.3 as 10.2 fails with INTERNALERROR> pluggy._manager.PluginValidationError: unknown hook 'pytest_configure_node' +# scipy from 1.6.3 to 1.10 +# expecttest from 0.1.3 to 0.3.0 +# xdoctest from 1.0.2 to 1.3.0 +python -m pip install "future==0.18.2" "hypothesis==5.35.1" "expecttest==0.3.0" "librosa>=0.6.2" "scipy==1.10.1" "psutil==5.9.1" "pynvml==11.4.1" "pillow==9.2.0" "unittest-xml-reporting<=3.2.0,>=2.0.0" "pytest==7.1.3" "pytest-xdist==2.5.0" "pytest-flakefinder==1.1.0" "pytest-rerunfailures==10.3" "pytest-shard==0.1.2" "sympy==1.11.1" "xdoctest==1.3.0" "pygments==2.12.0" "opt-einsum>=3.3" "networkx==2.8.8" "mpmath==1.2.1" "pytest-cpp==2.3.0" "boto3==1.35.42" # Install Z3 optional dependency for Windows builds. python -m pip install z3-solver==4.15.1.0 # Install tlparse for test\dynamo\test_structured_trace.py UTs. 
-python -m pip install tlparse==0.3.30 +python -m pip install tlparse==0.4.0 # Install parameterized python -m pip install parameterized==0.8.1 @@ -52,9 +59,6 @@ python -m pip install parameterized==0.8.1 # Install pulp for testing ilps under torch\distributed\_tools python -m pip install pulp==2.9.0 -# Install expecttest to merge https://github.com/pytorch/pytorch/pull/155308 -python -m pip install expecttest==0.3.0 - run_tests() { # Run nvidia-smi if available for path in '/c/Program Files/NVIDIA Corporation/NVSMI/nvidia-smi.exe' /c/Windows/System32/nvidia-smi.exe; do diff --git a/.ci/pytorch/windows/cuda128.bat b/.ci/pytorch/windows/cuda128.bat index bbdfb4bd1bb7..bbd349e2efb4 100644 --- a/.ci/pytorch/windows/cuda128.bat +++ b/.ci/pytorch/windows/cuda128.bat @@ -37,10 +37,10 @@ IF "%CUDA_PATH_V128%"=="" ( ) IF "%BUILD_VISION%" == "" ( - set TORCH_CUDA_ARCH_LIST=6.1;7.0;7.5;8.0;8.6;9.0;10.0;12.0 + set TORCH_CUDA_ARCH_LIST=7.0;7.5;8.0;8.6;9.0;10.0;12.0 set TORCH_NVCC_FLAGS=-Xfatbin -compress-all ) ELSE ( - set NVCC_FLAGS=-D__CUDA_NO_HALF_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_86,code=compute_86 -gencode=arch=compute_90,code=compute_90 -gencode=arch=compute_100,code=compute_100 -gencode=arch=compute_120,code=compute_120 + set NVCC_FLAGS=-D__CUDA_NO_HALF_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_86,code=compute_86 -gencode=arch=compute_90,code=compute_90 -gencode=arch=compute_100,code=compute_100 -gencode=arch=compute_120,code=compute_120 ) set "CUDA_PATH=%CUDA_PATH_V128%" diff --git a/.ci/pytorch/windows/internal/copy.bat b/.ci/pytorch/windows/internal/copy.bat index 40f2bd7acdbb..e0281c0d78a4 100644 --- a/.ci/pytorch/windows/internal/copy.bat +++ b/.ci/pytorch/windows/internal/copy.bat @@ -1,12 +1,20 @@ -copy "%CUDA_PATH%\bin\cusparse*64_*.dll*" pytorch\torch\lib -copy "%CUDA_PATH%\bin\cublas*64_*.dll*" pytorch\torch\lib -copy "%CUDA_PATH%\bin\cudart*64_*.dll*" pytorch\torch\lib -copy "%CUDA_PATH%\bin\curand*64_*.dll*" pytorch\torch\lib -copy "%CUDA_PATH%\bin\cufft*64_*.dll*" pytorch\torch\lib -copy "%CUDA_PATH%\bin\cusolver*64_*.dll*" pytorch\torch\lib + +if %CUDA_VERSION% geq 130 ( + set "dll_path=bin\x64" +) else ( + set "dll_path=bin" +) + +copy "%CUDA_PATH%\%dll_path%\cusparse*64_*.dll*" pytorch\torch\lib +copy "%CUDA_PATH%\%dll_path%\cublas*64_*.dll*" pytorch\torch\lib +copy "%CUDA_PATH%\%dll_path%\cudart*64_*.dll*" pytorch\torch\lib +copy "%CUDA_PATH%\%dll_path%\curand*64_*.dll*" pytorch\torch\lib +copy "%CUDA_PATH%\%dll_path%\cufft*64_*.dll*" pytorch\torch\lib +copy "%CUDA_PATH%\%dll_path%\cusolver*64_*.dll*" pytorch\torch\lib +copy "%CUDA_PATH%\%dll_path%\nvrtc*64_*.dll*" pytorch\torch\lib +copy "%CUDA_PATH%\%dll_path%\nvJitLink_*.dll*" pytorch\torch\lib copy "%CUDA_PATH%\bin\cudnn*64_*.dll*" pytorch\torch\lib -copy "%CUDA_PATH%\bin\nvrtc*64_*.dll*" pytorch\torch\lib copy "%CUDA_PATH%\extras\CUPTI\lib64\cupti64_*.dll*" pytorch\torch\lib copy "%CUDA_PATH%\extras\CUPTI\lib64\nvperf_host*.dll*" pytorch\torch\lib @@ -20,8 +28,3 @@ copy "%libuv_ROOT%\bin\uv.dll" pytorch\torch\lib if exist "C:\Windows\System32\zlibwapi.dll" ( copy "C:\Windows\System32\zlibwapi.dll" pytorch\torch\lib ) - -::copy nvJitLink dll is requires for cuda 12+ -if exist 
"%CUDA_PATH%\bin\nvJitLink_*.dll*" ( - copy "%CUDA_PATH%\bin\nvJitLink_*.dll*" pytorch\torch\lib -) diff --git a/.ci/pytorch/windows/internal/driver_update.bat b/.ci/pytorch/windows/internal/driver_update.bat index 5ed3a236c09a..2c173aed818b 100644 --- a/.ci/pytorch/windows/internal/driver_update.bat +++ b/.ci/pytorch/windows/internal/driver_update.bat @@ -1,9 +1,9 @@ -set WIN_DRIVER_VN=528.89 -set "DRIVER_DOWNLOAD_LINK=https://ossci-windows.s3.amazonaws.com/%WIN_DRIVER_VN%-data-center-tesla-desktop-winserver-2016-2019-2022-dch-international.exe" & REM @lint-ignore -curl --retry 3 -kL %DRIVER_DOWNLOAD_LINK% --output %WIN_DRIVER_VN%-data-center-tesla-desktop-winserver-2016-2019-2022-dch-international.exe +set WIN_DRIVER_VN=580.88 +set "DRIVER_DOWNLOAD_LINK=https://ossci-windows.s3.amazonaws.com/%WIN_DRIVER_VN%-data-center-tesla-desktop-win10-win11-64bit-dch-international.exe" & REM @lint-ignore +curl --retry 3 -kL %DRIVER_DOWNLOAD_LINK% --output %WIN_DRIVER_VN%-data-center-tesla-desktop-win10-win11-64bit-dch-international.exe if errorlevel 1 exit /b 1 -start /wait %WIN_DRIVER_VN%-data-center-tesla-desktop-winserver-2016-2019-2022-dch-international.exe -s -noreboot +start /wait %WIN_DRIVER_VN%-data-center-tesla-desktop-win10-win11-64bit-dch-international.exe -s -noreboot if errorlevel 1 exit /b 1 -del %WIN_DRIVER_VN%-data-center-tesla-desktop-winserver-2016-2019-2022-dch-international.exe || ver > NUL +del %WIN_DRIVER_VN%-data-center-tesla-desktop-win10-win11-64bit-dch-international.exe || ver > NUL diff --git a/.ci/pytorch/windows/internal/xpu_install.bat b/.ci/pytorch/windows/internal/xpu_install.bat index 2296adf4dfe6..f143571a5692 100644 --- a/.ci/pytorch/windows/internal/xpu_install.bat +++ b/.ci/pytorch/windows/internal/xpu_install.bat @@ -13,9 +13,9 @@ if not exist "%SRC_DIR%\temp_build" mkdir "%SRC_DIR%\temp_build" :xpu_bundle_install_start set XPU_BUNDLE_PARENT_DIR=C:\Program Files (x86)\Intel\oneAPI -set XPU_BUNDLE_URL=https://registrationcenter-download.intel.com/akdlm/IRC_NAS/9d6d6c17-ca2d-4735-9331-99447e4a1280/intel-deep-learning-essentials-2025.0.1.28_offline.exe +set XPU_BUNDLE_URL=https://registrationcenter-download.intel.com/akdlm/IRC_NAS/75d4eb97-914a-4a95-852c-7b9733d80f74/intel-deep-learning-essentials-2025.1.3.8_offline.exe set XPU_BUNDLE_PRODUCT_NAME=intel.oneapi.win.deep-learning-essentials.product -set XPU_BUNDLE_VERSION=2025.0.1+20 +set XPU_BUNDLE_VERSION=2025.1.3+5 set XPU_BUNDLE_INSTALLED=0 set XPU_BUNDLE_UNINSTALL=0 set XPU_EXTRA_URL=NULL @@ -24,9 +24,9 @@ set XPU_EXTRA_VERSION=2025.0.1+1226 set XPU_EXTRA_INSTALLED=0 set XPU_EXTRA_UNINSTALL=0 -if not [%XPU_VERSION%]==[] if [%XPU_VERSION%]==[2025.1] ( - set XPU_BUNDLE_URL=https://registrationcenter-download.intel.com/akdlm/IRC_NAS/75d4eb97-914a-4a95-852c-7b9733d80f74/intel-deep-learning-essentials-2025.1.3.8_offline.exe - set XPU_BUNDLE_VERSION=2025.1.3+5 +if not [%XPU_VERSION%]==[] if [%XPU_VERSION%]==[2025.2] ( + set XPU_BUNDLE_URL=https://registrationcenter-download.intel.com/akdlm/IRC_NAS/24751ead-ddc5-4479-b9e6-f9fe2ff8b9f2/intel-deep-learning-essentials-2025.2.1.25_offline.exe + set XPU_BUNDLE_VERSION=2025.2.1+20 ) :: Check if XPU bundle is target version or already installed @@ -90,14 +90,3 @@ if errorlevel 1 exit /b 1 del xpu_extra.exe :xpu_install_end - -if not "%XPU_ENABLE_KINETO%"=="1" goto install_end -:: Install Level Zero SDK -set XPU_EXTRA_LZ_URL=https://github.com/oneapi-src/level-zero/releases/download/v1.14.0/level-zero-sdk_1.14.0.zip -curl -k -L %XPU_EXTRA_LZ_URL% --output 
"%SRC_DIR%\temp_build\level_zero_sdk.zip" -echo "Installing level zero SDK..." -7z x "%SRC_DIR%\temp_build\level_zero_sdk.zip" -o"%SRC_DIR%\temp_build\level_zero" -set "INCLUDE=%SRC_DIR%\temp_build\level_zero\include;%INCLUDE%" -del "%SRC_DIR%\temp_build\level_zero_sdk.zip" - -:install_end diff --git a/.ci/wheel/build_wheel.sh b/.ci/wheel/build_wheel.sh index b9b6448ae208..e63a68e4f193 100755 --- a/.ci/wheel/build_wheel.sh +++ b/.ci/wheel/build_wheel.sh @@ -124,19 +124,15 @@ popd export TH_BINARY_BUILD=1 export INSTALL_TEST=0 # dont install test binaries into site-packages -export MACOSX_DEPLOYMENT_TARGET=10.15 +export MACOSX_DEPLOYMENT_TARGET=11.0 export CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname $(which conda))/../"} -SETUPTOOLS_PINNED_VERSION="==70.1.0" -PYYAML_PINNED_VERSION="==5.3" EXTRA_CONDA_INSTALL_FLAGS="" CONDA_ENV_CREATE_FLAGS="" RENAME_WHEEL=true case $desired_python in 3.14t) echo "Using 3.14 deps" - SETUPTOOLS_PINNED_VERSION=">=70.1.0" - PYYAML_PINNED_VERSION=">=6.0.1" NUMPY_PINNED_VERSION="==2.1.0" CONDA_ENV_CREATE_FLAGS="python-freethreading" EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge" @@ -145,8 +141,6 @@ case $desired_python in ;; 3.14) echo "Using 3.14t deps" - SETUPTOOLS_PINNED_VERSION=">=70.1.0" - PYYAML_PINNED_VERSION=">=6.0.1" NUMPY_PINNED_VERSION="==2.1.0" EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge" desired_python="3.14.0rc1" @@ -154,8 +148,6 @@ case $desired_python in ;; 3.13t) echo "Using 3.13 deps" - SETUPTOOLS_PINNED_VERSION=">=70.1.0" - PYYAML_PINNED_VERSION=">=6.0.1" NUMPY_PINNED_VERSION="==2.1.0" CONDA_ENV_CREATE_FLAGS="python-freethreading" EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge" @@ -164,37 +156,23 @@ case $desired_python in ;; 3.13) echo "Using 3.13 deps" - SETUPTOOLS_PINNED_VERSION=">=70.1.0" - PYYAML_PINNED_VERSION=">=6.0.1" NUMPY_PINNED_VERSION="==2.1.0" ;; 3.12) echo "Using 3.12 deps" - SETUPTOOLS_PINNED_VERSION=">=70.1.0" - PYYAML_PINNED_VERSION=">=6.0.1" NUMPY_PINNED_VERSION="==2.0.2" ;; 3.11) echo "Using 3.11 deps" - SETUPTOOLS_PINNED_VERSION=">=70.1.0" - PYYAML_PINNED_VERSION=">=5.3" NUMPY_PINNED_VERSION="==2.0.2" ;; 3.10) echo "Using 3.10 deps" - SETUPTOOLS_PINNED_VERSION=">=70.1.0" - PYYAML_PINNED_VERSION=">=5.3" - NUMPY_PINNED_VERSION="==2.0.2" - ;; - 3.9) - echo "Using 3.9 deps" - SETUPTOOLS_PINNED_VERSION=">=70.1.0" - PYYAML_PINNED_VERSION=">=5.3" NUMPY_PINNED_VERSION="==2.0.2" ;; *) - echo "Using default deps" - NUMPY_PINNED_VERSION="==1.11.3" + echo "Unsupported version $desired_python" + exit 1 ;; esac @@ -204,8 +182,6 @@ conda create ${EXTRA_CONDA_INSTALL_FLAGS} -yn "$tmp_env_name" python="$desired_p source activate "$tmp_env_name" PINNED_PACKAGES=( - "setuptools${SETUPTOOLS_PINNED_VERSION}" - "pyyaml${PYYAML_PINNED_VERSION}" "numpy${NUMPY_PINNED_VERSION}" ) retry pip install "${PINNED_PACKAGES[@]}" -r "${pytorch_rootdir}/requirements-build.txt" @@ -223,7 +199,7 @@ export BUILD_TEST=OFF pushd "$pytorch_rootdir" echo "Calling setup.py bdist_wheel at $(date)" -python setup.py bdist_wheel -d "$whl_tmp_dir" +python setup.py bdist_wheel -d "$whl_tmp_dir" --plat-name ${mac_version} echo "Finished setup.py bdist_wheel at $(date)" diff --git a/.circleci/scripts/binary_populate_env.sh b/.circleci/scripts/binary_populate_env.sh index 87fea14b8d28..aa82d36aa7ce 100755 --- a/.circleci/scripts/binary_populate_env.sh +++ b/.circleci/scripts/binary_populate_env.sh @@ -5,7 +5,9 @@ export TZ=UTC tagged_version() { GIT_DIR="${workdir}/pytorch/.git" GIT_DESCRIBE="git --git-dir ${GIT_DIR} describe 
--tags --match v[0-9]*.[0-9]*.[0-9]*" - if [[ ! -d "${GIT_DIR}" ]]; then + if [[ -n "${CIRCLE_TAG:-}" ]]; then + echo "${CIRCLE_TAG}" + elif [[ ! -d "${GIT_DIR}" ]]; then echo "Abort, abort! Git dir ${GIT_DIR} does not exists!" kill $$ elif ${GIT_DESCRIBE} --exact >/dev/null; then @@ -69,16 +71,11 @@ fi export PYTORCH_BUILD_NUMBER=1 +# This part is done in the builder scripts so commenting the duplicate code +: <<'BLOCK_COMMENT' # Set triton version as part of PYTORCH_EXTRA_INSTALL_REQUIREMENTS TRITON_VERSION=$(cat $PYTORCH_ROOT/.ci/docker/triton_version.txt) - -# Here PYTORCH_EXTRA_INSTALL_REQUIREMENTS is already set for the all the wheel builds hence append TRITON_CONSTRAINT -TRITON_CONSTRAINT="platform_system == 'Linux' and platform_machine == 'x86_64'" - -# CUDA 12.9 builds have triton for Linux and Linux aarch64 binaries. -if [[ "$DESIRED_CUDA" == "cu129" ]]; then - TRITON_CONSTRAINT="platform_system == 'Linux'" -fi +TRITON_CONSTRAINT="platform_system == 'Linux'" if [[ "$PACKAGE_TYPE" =~ .*wheel.* && -n "${PYTORCH_EXTRA_INSTALL_REQUIREMENTS:-}" && ! "$PYTORCH_BUILD_VERSION" =~ .*xpu.* ]]; then TRITON_REQUIREMENT="triton==${TRITON_VERSION}; ${TRITON_CONSTRAINT}" @@ -117,6 +114,7 @@ if [[ "$PACKAGE_TYPE" =~ .*wheel.* && -n "$PYTORCH_BUILD_VERSION" && "$PYTORCH_B export PYTORCH_EXTRA_INSTALL_REQUIREMENTS="${PYTORCH_EXTRA_INSTALL_REQUIREMENTS} | ${TRITON_REQUIREMENT}" fi fi +BLOCK_COMMENT USE_GLOO_WITH_OPENSSL="ON" if [[ "$GPU_ARCH_TYPE" =~ .*aarch64.* ]]; then diff --git a/.circleci/scripts/binary_windows_build.sh b/.circleci/scripts/binary_windows_build.sh index 27cd36f94928..18dcde50e2b6 100644 --- a/.circleci/scripts/binary_windows_build.sh +++ b/.circleci/scripts/binary_windows_build.sh @@ -15,8 +15,7 @@ fi if [[ "$DESIRED_CUDA" == 'xpu' ]]; then export VC_YEAR=2022 export USE_SCCACHE=0 - export XPU_VERSION=2025.1 - export XPU_ENABLE_KINETO=1 + export XPU_VERSION=2025.2 fi echo "Free space on filesystem before build:" diff --git a/.circleci/scripts/binary_windows_test.sh b/.circleci/scripts/binary_windows_test.sh index 79f714265f2c..9326d9037e8b 100644 --- a/.circleci/scripts/binary_windows_test.sh +++ b/.circleci/scripts/binary_windows_test.sh @@ -8,7 +8,7 @@ export VC_YEAR=2022 if [[ "$DESIRED_CUDA" == 'xpu' ]]; then export VC_YEAR=2022 - export XPU_VERSION=2025.1 + export XPU_VERSION=2025.2 fi pushd "$PYTORCH_ROOT/.ci/pytorch/" diff --git a/.github/actionlint.yaml b/.github/actionlint.yaml index 85c7999c1857..798dee312306 100644 --- a/.github/actionlint.yaml +++ b/.github/actionlint.yaml @@ -12,7 +12,9 @@ self-hosted-runner: - linux.9xlarge.ephemeral - am2.linux.9xlarge.ephemeral - linux.12xlarge + - linux.12xlarge.memory - linux.24xlarge + - linux.24xlarge.memory - linux.24xlarge.ephemeral - linux.24xlarge.amd - linux.arm64.2xlarge diff --git a/.github/actions/build-external-packages/action.yml b/.github/actions/build-external-packages/action.yml index dc8b8b889536..c0c727d93ac6 100644 --- a/.github/actions/build-external-packages/action.yml +++ b/.github/actions/build-external-packages/action.yml @@ -4,6 +4,11 @@ name: Build External packages description: build external packages for PyTorch inputs: + cuda-version: + description: CUDA version to use + type: string + required: true + default: '12.8.1' cuda-arch-list: description: TORCH_CUDA_ARCH_LIST (e.g., "8.0;8.9;9.0") type: string @@ -44,10 +49,12 @@ runs: env: SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2 SCCACHE_REGION: us-east-1 + CUDA_VERSION: ${{ inputs.cuda-version }} TORCH_CUDA_ARCH_LIST: ${{ inputs.cuda-arch-list }} 
BASE_IMAGE: ${{ inputs.docker-image }} BUILD_TARGETS: ${{ inputs.build-targets }} - PARENT_OUTPUT_DIR: ${{ inputs.output-dir}} + PARENT_OUTPUT_DIR: ${{ inputs.output-dir }} + TORCH_WHEELS_PATH: ${{ inputs.torch-wheel-dir }} shell: bash run: | set -euo pipefail @@ -68,7 +75,6 @@ runs: export OUTPUT_DIR echo "Building external package: $target in directory $OUTPUT_DIR" python3 -m cli.run build external "$target" - done END_TIME=$(date +%s) diff --git a/.github/actions/checkout-pytorch/action.yml b/.github/actions/checkout-pytorch/action.yml index 055404c69474..15f193ef3a5d 100644 --- a/.github/actions/checkout-pytorch/action.yml +++ b/.github/actions/checkout-pytorch/action.yml @@ -57,6 +57,21 @@ runs: submodules: ${{ inputs.submodules }} show-progress: false + - name: Clean submodules post checkout + id: clean-submodules + if: ${{ steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' }} + shell: bash + env: + NO_SUDO: ${{ inputs.no-sudo }} + run: | + cd "${GITHUB_WORKSPACE}" + # Clean stale submodule dirs + if [ -z "${NO_SUDO}" ]; then + sudo git submodule foreach --recursive git clean -ffdx + else + git submodule foreach --recursive git clean -ffdx + fi + - name: Clean workspace (try again) if: ${{ steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' && (steps.first-clean.outcome != 'success' || steps.first-checkout-attempt.outcome != 'success') }} diff --git a/.github/actions/setup-win/action.yml b/.github/actions/setup-win/action.yml index 93c957896b5e..2ea330f93b49 100644 --- a/.github/actions/setup-win/action.yml +++ b/.github/actions/setup-win/action.yml @@ -6,6 +6,12 @@ inputs: cuda-version: description: which cuda version to install, 'cpu' for none required: true + python-version: + required: false + type: string + default: "3.10" + description: | + The python version to be used. Will be 3.10 by default runs: using: composite @@ -38,18 +44,24 @@ runs: CONDA="C:\Jenkins\Miniconda3\condabin\conda.bat" { + echo "CONDA=${CONDA}"; echo "CONDA_RUN=${CONDA} run --no-capture-output"; echo "CONDA_BUILD=${CONDA} run conda-build"; echo "CONDA_INSTALL=${CONDA} install"; } >> "${GITHUB_ENV}" - name: Setup Python3 + env: + PYTHON_VERSION: ${{ inputs.python-version }} shell: bash run: | set +e set -x - PYTHON3=$(${CONDA_RUN} which python3) + # Create new py_tmp env with python-version + ${CONDA} create -y -n py_tmp python=${PYTHON_VERSION} intel-openmp libuv + + PYTHON3=$(${CONDA_RUN} -n py_tmp which python3) EXIT_CODE=$? if [[ "${EXIT_CODE}" == "0" ]]; then @@ -62,7 +74,7 @@ runs: # installation, which is Python 3 based. Its Python is default to Python 3. Further, there # is also the Miniconda installation that is Python 2 based, and both can be installed if # needed. In both cases, Python binary is just called python - PYTHON=$(${CONDA_RUN} which python) + PYTHON=$(${CONDA_RUN} -n py_tmp which python) EXIT_CODE=$? 
if [[ "${EXIT_CODE}" == "0" ]]; then diff --git a/.github/ci_commit_pins/audio.txt b/.github/ci_commit_pins/audio.txt index 0b9c14848239..b0255e764c59 100644 --- a/.github/ci_commit_pins/audio.txt +++ b/.github/ci_commit_pins/audio.txt @@ -1 +1 @@ -10a5002c6195bd95e34df8fe28ff8a2d55a2a922 +27fc2493d383354a008106f22f3be232badee9a1 diff --git a/.github/ci_commit_pins/vllm.txt b/.github/ci_commit_pins/vllm.txt index 80c5a90c7be9..bbc484d273a1 100644 --- a/.github/ci_commit_pins/vllm.txt +++ b/.github/ci_commit_pins/vllm.txt @@ -1 +1 @@ -add1adfec742dfb13e614dab3372b5aafd1ff046 +78a47f87ce259a48f0391fa9ae15add05ea7432b diff --git a/.github/ci_commit_pins/xla.txt b/.github/ci_commit_pins/xla.txt index 53cf6c8c9915..ee530f8c8b21 100644 --- a/.github/ci_commit_pins/xla.txt +++ b/.github/ci_commit_pins/xla.txt @@ -1 +1 @@ -a1c6ee92c85e8b0955c20892ed68f032a6015c09 +r2.9 diff --git a/.github/ci_configs/vllm/Dockerfile.tmp_vllm b/.github/ci_configs/vllm/Dockerfile.tmp_vllm index 330a78424fee..2cee6ed2df19 100644 --- a/.github/ci_configs/vllm/Dockerfile.tmp_vllm +++ b/.github/ci_configs/vllm/Dockerfile.tmp_vllm @@ -12,54 +12,46 @@ ARG BUILD_BASE_IMAGE=torch-nightly-base # by default, it uses devel-ubuntu22.04 official image. ARG FINAL_BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 +# The logic is copied from https://github.com/vllm-project/vllm/blob/main/docker/Dockerfile +ARG GET_PIP_URL="https://bootstrap.pypa.io/get-pip.py" -#################### TORCH NIGHTLY BASE IMAGE #################### + +#################### TORCH NIGHTLY BASE IMAGE #################### # A base image for building vLLM with devel ubuntu 22.04, this is mainly used to build vllm in vllm builtkite ci -From nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 as torch-nightly-base -ARG CUDA_VERSION=12.8.1 -ARG PYTHON_VERSION=3.12 -ARG TARGETPLATFORM -ENV DEBIAN_FRONTEND=noninteractive - -RUN PYTHON_VERSION_STR=$(echo ${PYTHON_VERSION} | sed 's/\.//g') && \ - echo "export PYTHON_VERSION_STR=${PYTHON_VERSION_STR}" >> /etc/environment - -# Install Python and other dependencies if it does not existed -RUN if ! command -v python3 >/dev/null || ! python3 --version | grep -q "${PYTHON_VERSION}"; then \ - echo "Installing Python ${PYTHON_VERSION}..." 
&& \ - echo 'tzdata tzdata/Areas select America' | debconf-set-selections && \ - echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections && \ - apt-get update -y && \ - apt-get install -y ccache software-properties-common git curl sudo && \ - for i in 1 2 3; do \ - add-apt-repository -y ppa:deadsnakes/ppa && break || \ - { echo "Attempt $i failed, retrying in 5s..."; sleep 5; }; \ - done && \ - apt-get update -y && \ - apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv && \ - update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 && \ - update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} && \ - ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config && \ - curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION}; \ - else \ - echo "Python ${PYTHON_VERSION} already present, skipping setup."; \ - fi \ - && python3 --version && python3 -m pip --version +FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 as torch-nightly-base + +ARG CUDA_VERSION +ARG PYTHON_VERSION +ARG GET_PIP_URL + +# Install Python and other dependencies +RUN apt-get update -y \ + && apt-get install -y ccache software-properties-common git curl wget sudo vim \ + && add-apt-repository -y ppa:deadsnakes/ppa \ + && apt-get update -y \ + && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \ + && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \ + && update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \ + && ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \ + && curl -sS ${GET_PIP_URL} | python${PYTHON_VERSION} \ + && python3 --version && python3 -m pip --version # Upgrade to GCC 10 to avoid https://gcc.gnu.org/bugzilla/show_bug.cgi?id=92519 # as it was causing spam when compiling the CUTLASS kernels # Ensure gcc >= 10 to avoid CUTLASS issues (bug 92519) RUN current_gcc_version=$(gcc -dumpversion | cut -f1 -d.) 
&& \ - if [ "$current_gcc_version" -lt 10 ]; then \ - echo "GCC version is $current_gcc_version, installing gcc-10..."; \ - apt-get update && \ - apt-get install -y gcc-10 g++-10 && \ - update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-10 100 && \ - update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-10 100; \ - else \ - echo "GCC version is $current_gcc_version, no need to install gcc-10."; \ - fi && \ - gcc --version && g++ --version + if command -v apt-get >/dev/null; then \ + if [ "$current_gcc_version" -lt 10 ]; then \ + echo "GCC version is $current_gcc_version, installing gcc-10..."; \ + apt-get update \ + && apt-get install -y gcc-10 g++-10 \ + && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-10 100 \ + && update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-10 100; \ + else \ + echo "GCC version is $current_gcc_version, no need to install gcc-10."; \ + fi \ + fi \ + && gcc --version && g++ --version # install uv for faster pip installs RUN --mount=type=cache,target=/root/.cache/uv \ @@ -79,6 +71,21 @@ ENV UV_LINK_MODE=copy FROM ${BUILD_BASE_IMAGE} AS base USER root +ARG CUDA_VERSION +ARG PYTHON_VERSION + +# TODO (huydhn): Only work with PyTorch manylinux builder +ENV PATH="/opt/python/cp312-cp312/bin:${PATH}" + +# Install some system dependencies and double check python version +RUN if command -v apt-get >/dev/null; then \ + apt-get update -y \ + && apt-get install -y ccache software-properties-common git curl wget sudo vim; \ + else \ + dnf install -y git curl wget sudo vim; \ + fi \ + && python3 --version && python3 -m pip --version + # Workaround for https://github.com/openai/triton/issues/2507 and # https://github.com/pytorch/pytorch/issues/107960 -- hopefully # this won't be needed for future versions of this docker image @@ -118,17 +125,15 @@ RUN --mount=type=bind,source=${TORCH_WHEELS_PATH},target=/dist \ if [ -n "$TORCH_WHEELS_PATH" ] && [ "$TORCH_WHEELS_PATH" != "./requirements" ] && [ -d "/dist" ] && ls /dist/torch*.whl >/dev/null 2>&1; then \ echo "[INFO] Installing torch wheels to build vllm"; \ torch_whl=$(find /dist -maxdepth 1 -name 'torch-*.whl' -print -quit); \ - vision_whl=$(find /dist/vision -name 'torchvision*.whl' | head -n1 | xargs); \ - audio_whl=$(find /dist/audio -name 'torchaudio*.whl' | head -n1 | xargs); \ - uv pip install --system "${torch_whl}[opt-einsum]"; \ - uv pip install --system "${vision_whl}"; \ - uv pip install --system "${audio_whl}"; \ + vision_whl=$(find /dist -name 'torchvision*.whl' | head -n1 | xargs); \ + audio_whl=$(find /dist -name 'torchaudio*.whl' | head -n1 | xargs); \ + uv pip install --system "${torch_whl}[opt-einsum]" "${vision_whl}" "${audio_whl}" /dist/*.whl; \ elif [ -n "$PINNED_TORCH_VERSION" ]; then \ echo "[INFO] Installing pinned torch nightly version to build vllm: $PINNED_TORCH_VERSION"; \ - uv pip install --system "$PINNED_TORCH_VERSION" --index-url https://download.pytorch.org/whl/nightly/cu128; \ + uv pip install --system "$PINNED_TORCH_VERSION" --index-url https://download.pytorch.org/whl/nightly/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.'); \ else \ echo "[INFO] Installing torch nightly with latest one to build vllm"; \ - uv pip install --system torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu128; \ + uv pip install --system torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu$(echo $CUDA_VERSION | cut -d. 
-f1,2 | tr -d '.'); \ fi # Install numba 0.61.2 for cuda environment @@ -137,12 +142,11 @@ RUN --mount=type=cache,target=/root/.cache/uv \ # Install common dependencies from vllm common.txt RUN --mount=type=cache,target=/root/.cache/uv \ -uv pip install --system -r requirements/common.txt - + uv pip install --system -r requirements/common.txt # Must put before installing xformers, so it can install the correct version of xfomrers. -ARG exformer_cuda_arch_list='7.5;8.0+PTX;9.0a' -ENV TORCH_CUDA_ARCH_LIST=${exformer_cuda_arch_list} +ARG xformers_cuda_arch_list='7.5;8.0+PTX;9.0a' +ENV TORCH_CUDA_ARCH_LIST=${xformers_cuda_arch_list} ARG max_jobs=16 ENV MAX_JOBS=${max_jobs} @@ -153,8 +157,8 @@ RUN pip freeze | grep -E 'ninja' # Build xformers with cuda and torch nightly/wheel # following official xformers guidance: https://github.com/facebookresearch/xformers#build -# sha for https://github.com/facebookresearch/xformers/tree/v0.0.31 -ARG XFORMERS_COMMIT=eb0946a363464da96ea40afd1a7f72a907c25497 +# sha for https://github.com/facebookresearch/xformers/tree/v0.0.32.post2 +ARG XFORMERS_COMMIT=5d4b92a5e5a9c6c6d4878283f47d82e17995b468 ENV CCACHE_DIR=/root/.cache/ccache RUN --mount=type=cache,target=/root/.cache/ccache \ @@ -176,6 +180,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \ # Build can take a long time, and the torch nightly version fetched from url can be different in next docker stage. # track the nightly torch version used in the build, when we set up runtime environment we can make sure the version is the same RUN uv pip freeze | grep -i '^torch\|^torchvision\|^torchaudio' > torch_build_versions.txt + RUN cat torch_build_versions.txt RUN pip freeze | grep -E 'torch|xformers|torchvision|torchaudio' @@ -187,11 +192,6 @@ RUN pip freeze | grep -E 'torch|xformers|torchvision|torchaudio' FROM base AS build ARG TARGETPLATFORM -ENV UV_HTTP_TIMEOUT=500 -ENV UV_INDEX_STRATEGY="unsafe-best-match" -# Use copy mode to avoid hardlink failures with Docker cache mounts -ENV UV_LINK_MODE=copy - COPY . . RUN python3 use_existing_torch.py @@ -250,9 +250,9 @@ RUN --mount=type=cache,target=/root/.cache/ccache \ python3 setup.py bdist_wheel --dist-dir=vllm-dist --py-limited-api=cp38; \ fi -RUN echo "[DEBUG] Listing current directory:" && \ +RUN echo "[INFO] Listing current directory:" && \ ls -al && \ - echo "[DEBUG] Showing torch_build_versions.txt content:" && \ + echo "[INFO] Showing torch_build_versions.txt content:" && \ cat torch_build_versions.txt #################### WHEEL BUILD IMAGE #################### @@ -262,42 +262,40 @@ RUN echo "[DEBUG] Listing current directory:" && \ # Setup clean environment for vLLM for test and api server using ubuntu22.04 with AOT flashinfer FROM ${FINAL_BASE_IMAGE} AS vllm-base USER root + +ARG CUDA_VERSION +ARG PYTHON_VERSION +ARG GET_PIP_URL + +# TODO (huydhn): Only work with PyTorch manylinux builder +ENV PATH="/opt/python/cp312-cp312/bin:${PATH}" + # prepare for environment starts WORKDIR /workspace -RUN PYTHON_VERSION_STR=$(echo ${PYTHON_VERSION} | sed 's/\.//g') && \ - echo "export PYTHON_VERSION_STR=${PYTHON_VERSION_STR}" >> /etc/environment - -# Install Python and other dependencies if it does not existed -RUN if ! command -v python3 >/dev/null || ! python3 --version | grep -q "${PYTHON_VERSION}"; then \ - echo "Installing Python ${PYTHON_VERSION}..." 
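Editorial note: the nightly index URL used in the torch install steps above is derived from CUDA_VERSION by keeping only the major and minor components and dropping the dot, so 12.8.1 becomes cu128. A minimal Python sketch of that mapping (the helper name is illustrative, not part of the Dockerfile):

# Sketch: reproduce `cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')` in Python.
def nightly_cuda_suffix(cuda_version: str) -> str:
    """Map a full CUDA version such as '12.8.1' to the wheel index suffix 'cu128'."""
    major, minor = cuda_version.split(".")[:2]
    return f"cu{major}{minor}"

assert nightly_cuda_suffix("12.8.1") == "cu128"
assert nightly_cuda_suffix("13.0.0") == "cu130"
print("https://download.pytorch.org/whl/nightly/" + nightly_cuda_suffix("12.8.1"))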
&& \ - echo 'tzdata tzdata/Areas select America' | debconf-set-selections && \ - echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections && \ - apt-get update -y && \ - apt-get install -y ccache software-properties-common git curl sudo && \ - for i in 1 2 3; do \ - add-apt-repository -y ppa:deadsnakes/ppa && break || \ - { echo "Attempt $i failed, retrying in 5s..."; sleep 5; }; \ - done && \ - apt-get update -y && \ - apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv && \ - update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 && \ - update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} && \ - ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config && \ - curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION}; \ - else \ - echo "Python ${PYTHON_VERSION} already present, skipping setup."; \ - fi \ - && python3 --version && python3 -m pip --version - +# Install Python and other dependencies +RUN if command -v apt-get >/dev/null; then \ + apt-get update -y \ + && apt-get install -y ccache software-properties-common git curl wget sudo vim \ + && add-apt-repository -y ppa:deadsnakes/ppa \ + && apt-get update -y \ + && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \ + && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \ + && update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \ + && ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \ + && curl -sS ${GET_PIP_URL} | python${PYTHON_VERSION}; \ + else \ + dnf install -y git curl wget sudo vim; \ + fi \ + && python3 --version && python3 -m pip --version # Get the torch versions, and whls used in previous stagtes for consistency COPY --from=base /workspace/torch_build_versions.txt ./torch_build_versions.txt COPY --from=base /workspace/xformers-dist /wheels/xformers COPY --from=build /workspace/vllm-dist /wheels/vllm -RUN echo "[DEBUG] Listing current directory before torch install step:" && \ +RUN echo "[INFO] Listing current directory before torch install step:" && \ ls -al && \ - echo "[DEBUG] Showing torch_build_versions.txt content:" && \ + echo "[INFO] Showing torch_build_versions.txt content:" && \ cat torch_build_versions.txt # Workaround for https://github.com/openai/triton/issues/2507 and @@ -306,7 +304,6 @@ RUN echo "[DEBUG] Listing current directory before torch install step:" && \ # or future versions of triton. RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/ - # Install uv for faster pip installs if not existed RUN --mount=type=cache,target=/root/.cache/uv \ if ! 
python3 -m uv --version > /dev/null 2>&1; then \ @@ -326,15 +323,13 @@ RUN --mount=type=bind,source=${TORCH_WHEELS_PATH},target=/dist \ --mount=type=cache,target=/root/.cache/uv \ if [ -n "$TORCH_WHEELS_PATH" ] && [ "$TORCH_WHEELS_PATH" != "./requirements" ] && [ -d "/dist" ] && ls /dist/torch*.whl >/dev/null 2>&1; then \ torch_whl=$(find /dist -maxdepth 1 -name 'torch-*.whl' -print -quit); \ - vision_whl=$(find /dist/vision -name 'torchvision*.whl' | head -n1 | xargs); \ - audio_whl=$(find /dist/audio -name 'torchaudio*.whl' | head -n1 | xargs); \ + vision_whl=$(find /dist -name 'torchvision*.whl' | head -n1 | xargs); \ + audio_whl=$(find /dist -name 'torchaudio*.whl' | head -n1 | xargs); \ echo "[INFO] Use wheels to build : '${torch_whl}' '${audio_whl}' '${vision_whl}'"; \ - uv pip install --system "${torch_whl}[opt-einsum]"; \ - uv pip install --system "${vision_whl}"; \ - uv pip install --system "${audio_whl}"; \ + uv pip install --system "${torch_whl}[opt-einsum]" "${vision_whl}" "${audio_whl}" /dist/*.whl; \ else \ echo "[INFO] Installing torch versions from torch_build_versions.txt"; \ - uv pip install --system $(cat torch_build_versions.txt | xargs) --index-url https://download.pytorch.org/whl/nightly/cu128; \ + uv pip install --system $(cat torch_build_versions.txt | xargs) --index-url https://download.pytorch.org/whl/nightly/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.'); \ fi # Install the vllm wheel from previous stage @@ -345,9 +340,8 @@ RUN --mount=type=cache,target=/root/.cache/uv \ RUN --mount=type=cache,target=/root/.cache/uv \ uv pip install --system /wheels/xformers/*.whl --verbose - # Build flashinfer from source. -ARG torch_cuda_arch_list='8.0;8.9;9.0a' +ARG torch_cuda_arch_list='8.0;8.9;9.0a;10.0a;12.0' # install package for build flashinfer # see issue: https://github.com/flashinfer-ai/flashinfer/issues/738 @@ -358,7 +352,7 @@ ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list} # Build flashinfer for torch nightly from source around 10 mins ARG FLASHINFER_GIT_REPO="https://github.com/flashinfer-ai/flashinfer.git" # Keep this in sync with https://github.com/vllm-project/vllm/blob/main/requirements/cuda.txt -ARG FLASHINFER_GIT_REF="v0.2.9rc2" +ARG FLASHINFER_GIT_REF="v0.2.14.post1" RUN --mount=type=cache,target=/root/.cache/uv \ git clone --depth 1 --recursive --shallow-submodules \ --branch ${FLASHINFER_GIT_REF} \ @@ -376,6 +370,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \ # Logging to confirm the torch versions RUN pip freeze | grep -E 'torch|xformers|vllm|flashinfer' +RUN uv pip freeze | grep -i '^torch\|^torchvision\|^torchaudio\|^xformers\|^vllm\|^flashinfer' > build_summary.txt ################### VLLM INSTALLED IMAGE #################### @@ -414,11 +409,6 @@ RUN --mount=type=cache,target=/root/.cache/uv \ RUN --mount=type=cache,target=/root/.cache/uv \ uv pip install --system -r requirements/nightly_torch_test.txt -# Workaround for #17068 -# pinned commit for v2.2.4 -RUN --mount=type=cache,target=/root/.cache/uv \ - uv pip install --system --no-build-isolation "git+https://github.com/state-spaces/mamba@95d8aba8a8c75aedcaa6143713b11e745e7cd0d9#egg=mamba-ssm" - # Logging to confirm the torch versions RUN pip freeze | grep -E 'torch|xformers|vllm|flashinfer' @@ -433,4 +423,5 @@ FROM scratch as export-wheels # Just copy the wheels we prepared in previous stages COPY --from=base /workspace/xformers-dist /wheels/xformers COPY --from=build /workspace/vllm-dist /wheels/vllm +COPY --from=vllm-base /workspace/build_summary.txt /wheels/build_summary.txt COPY 
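Editorial note: TORCH_CUDA_ARCH_LIST values such as '8.0;8.9;9.0a;10.0a;12.0' select which GPU architectures the xformers/flashinfer kernels above are compiled for. A rough, hedged sketch of how such a list is conventionally expanded into nvcc -gencode flags; the exact expansion these builds perform may differ in detail:

def arch_list_to_gencode(arch_list: str) -> list:
    """Roughly expand '8.0;9.0a;8.0+PTX' into nvcc -gencode flags (illustrative only)."""
    flags = []
    for entry in filter(None, arch_list.split(";")):
        keep_ptx = entry.endswith("+PTX")
        arch = entry.removesuffix("+PTX").replace(".", "")  # '9.0a' -> '90a'
        flags.append(f"-gencode=arch=compute_{arch},code=sm_{arch}")
        if keep_ptx:
            # '+PTX' additionally embeds PTX so newer GPUs can JIT-compile the kernel
            flags.append(f"-gencode=arch=compute_{arch},code=compute_{arch}")
    return flags

print(arch_list_to_gencode("8.0;8.9;9.0a;10.0a;12.0"))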
--from=vllm-base /workspace/wheels/flashinfer /wheels/flashinfer-python diff --git a/.github/requirements/pip-requirements-macOS.txt b/.github/requirements/pip-requirements-macOS.txt index 224835188d87..3a27cac46f71 100644 --- a/.github/requirements/pip-requirements-macOS.txt +++ b/.github/requirements/pip-requirements-macOS.txt @@ -28,7 +28,7 @@ pyyaml==6.0.2 scipy==1.12.0 setuptools==72.1.0 sympy==1.13.3 -tlparse==0.3.30 +tlparse==0.4.0 tensorboard==2.13.0 typing-extensions==4.12.2 unittest-xml-reporting<=3.2.0,>=2.0.0 diff --git a/.github/scripts/build_triton_wheel.py b/.github/scripts/build_triton_wheel.py index beec9f96aba2..f2851e331725 100644 --- a/.github/scripts/build_triton_wheel.py +++ b/.github/scripts/build_triton_wheel.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 import os +import re import shutil import sys from pathlib import Path @@ -50,6 +51,30 @@ def patch_init_py( with open(path, "w") as f: f.write(orig) +def get_rocm_version() -> str: + rocm_path = os.environ.get('ROCM_HOME') or os.environ.get('ROCM_PATH') or "/opt/rocm" + rocm_version = "0.0.0" + rocm_version_h = f"{rocm_path}/include/rocm-core/rocm_version.h" + if not os.path.isfile(rocm_version_h): + rocm_version_h = f"{rocm_path}/include/rocm_version.h" + # The file could be missing due to 1) ROCm version < 5.2, or 2) no ROCm install. + if os.path.isfile(rocm_version_h): + RE_MAJOR = re.compile(r"#define\s+ROCM_VERSION_MAJOR\s+(\d+)") + RE_MINOR = re.compile(r"#define\s+ROCM_VERSION_MINOR\s+(\d+)") + RE_PATCH = re.compile(r"#define\s+ROCM_VERSION_PATCH\s+(\d+)") + major, minor, patch = 0, 0, 0 + for line in open(rocm_version_h): + match = RE_MAJOR.search(line) + if match: + major = int(match.group(1)) + match = RE_MINOR.search(line) + if match: + minor = int(match.group(1)) + match = RE_PATCH.search(line) + if match: + patch = int(match.group(1)) + rocm_version = str(major)+"."+str(minor)+"."+str(patch) + return rocm_version def build_triton( *, @@ -64,14 +89,24 @@ def build_triton( if "MAX_JOBS" not in env: max_jobs = os.cpu_count() or 1 env["MAX_JOBS"] = str(max_jobs) - + if not release: + # Nightly binaries include the triton commit hash, i.e. 2.1.0+e6216047b8 + # while release build should only include the version, i.e. 
2.1.0 + rocm_version = get_rocm_version() + version_suffix = f"+rocm{rocm_version}.git{commit_hash[:8]}" + version += version_suffix with TemporaryDirectory() as tmpdir: triton_basedir = Path(tmpdir) / "triton" triton_pythondir = triton_basedir / "python" triton_repo = "https://github.com/openai/triton" if device == "rocm": - triton_pkg_name = "pytorch-triton-rocm" + triton_repo = "https://github.com/ROCm/triton" + rocm_version = get_rocm_version() # e.g., "7.0.1" + if tuple(map(int, rocm_version.split("."))) > (7, 0, 0): + triton_pkg_name = "triton" + else: + triton_pkg_name = "pytorch-triton-rocm" elif device == "xpu": triton_pkg_name = "pytorch-triton-xpu" triton_repo = "https://github.com/intel/intel-xpu-backend-for-triton" @@ -84,10 +119,12 @@ def build_triton( ["git", "checkout", f"release/{ver}.{rev}.x"], cwd=triton_basedir ) else: + check_call(["git", "fetch", "origin", commit_hash], cwd=triton_basedir) check_call(["git", "checkout", commit_hash], cwd=triton_basedir) # change built wheel name and version env["TRITON_WHEEL_NAME"] = triton_pkg_name + env["TRITON_WHEEL_VERSION_SUFFIX"] = version_suffix if with_clang_ldd: env["TRITON_BUILD_WITH_CLANG_LLD"] = "1" diff --git a/.github/scripts/filter_test_configs.py b/.github/scripts/filter_test_configs.py index 9ba210a5ed2b..dd16dbc18db2 100755 --- a/.github/scripts/filter_test_configs.py +++ b/.github/scripts/filter_test_configs.py @@ -41,9 +41,9 @@ def is_cuda_or_rocm_job(job_name: Optional[str]) -> bool: } # The link to the published list of disabled jobs -DISABLED_JOBS_URL = "https://ossci-metrics.s3.amazonaws.com/disabled-jobs.json" +DISABLED_JOBS_URL = "https://ossci-metrics.s3.amazonaws.com/disabled-jobs.json?versionId=hjktHz2WOejHpxKpkqpDknTt5rMTM9KK" # and unstable jobs -UNSTABLE_JOBS_URL = "https://ossci-metrics.s3.amazonaws.com/unstable-jobs.json" +UNSTABLE_JOBS_URL = "https://ossci-metrics.s3.amazonaws.com/unstable-jobs.json?versionId=wrjdvvQTJxgvMO.rGw5MEuMsj6XbjuV7" # Some constants used to handle disabled and unstable jobs JOB_NAME_SEP = "/" diff --git a/.github/scripts/generate_binary_build_matrix.py b/.github/scripts/generate_binary_build_matrix.py index a576706ace22..4dc97ee6a284 100644 --- a/.github/scripts/generate_binary_build_matrix.py +++ b/.github/scripts/generate_binary_build_matrix.py @@ -16,18 +16,16 @@ # NOTE: Please also update the CUDA sources in `PIP_SOURCES` in tools/nightly.py when changing this -CUDA_ARCHES = ["12.6", "12.8", "12.9", "13.0"] +CUDA_ARCHES = ["12.6", "12.8", "13.0"] CUDA_STABLE = "12.8" CUDA_ARCHES_FULL_VERSION = { "12.6": "12.6.3", "12.8": "12.8.1", - "12.9": "12.9.1", "13.0": "13.0.0", } CUDA_ARCHES_CUDNN_VERSION = { "12.6": "9", "12.8": "9", - "12.9": "9", "13.0": "9", } @@ -40,99 +38,82 @@ CPU_S390X_ARCH = ["cpu-s390x"] -CUDA_AARCH64_ARCHES = ["12.9-aarch64"] +CUDA_AARCH64_ARCHES = ["12.6-aarch64", "12.8-aarch64", "13.0-aarch64"] PYTORCH_EXTRA_INSTALL_REQUIREMENTS = { "12.6": ( - "nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | " - 
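Editorial note: get_rocm_version() above scrapes ROCM_VERSION_MAJOR/MINOR/PATCH out of rocm_version.h, the nightly wheel version then gains a '+rocm<version>.git<sha8>' suffix, and the package is renamed from pytorch-triton-rocm to triton once the ROCm version is newer than 7.0.0. A small sketch of that flow with made-up header contents (the commit hash reuses the example from the comment above):

import re

sample_header = """
#define ROCM_VERSION_MAJOR 7
#define ROCM_VERSION_MINOR 0
#define ROCM_VERSION_PATCH 1
"""

def parse_rocm_version(text: str) -> str:
    """Mimic get_rocm_version(): pull major.minor.patch out of rocm_version.h text."""
    fields = {}
    for name in ("MAJOR", "MINOR", "PATCH"):
        m = re.search(rf"#define\s+ROCM_VERSION_{name}\s+(\d+)", text)
        fields[name] = int(m.group(1)) if m else 0
    return f"{fields['MAJOR']}.{fields['MINOR']}.{fields['PATCH']}"

rocm_version = parse_rocm_version(sample_header)              # '7.0.1'
commit_hash = "e6216047b8"                                    # example triton commit hash
version_suffix = f"+rocm{rocm_version}.git{commit_hash[:8]}"  # '+rocm7.0.1.gite6216047'
pkg = "triton" if tuple(map(int, rocm_version.split("."))) > (7, 0, 0) else "pytorch-triton-rocm"
print(pkg, "2.1.0" + version_suffix)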
"nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'" + "nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | " + "nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | " + "nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | " + "nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | " + "nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | " + "nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | " + "nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | " + "nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | " + "nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | " + "nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | " + "nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | " + "nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | " + "nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | " + "nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | " + "nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'" ), "12.8": ( - "nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'" - ), - "12.9": ( - "nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | " - 
"nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'" + "nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | " + "nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | " + "nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | " + "nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | " + "nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | " + "nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | " + "nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | " + "nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | " + "nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | " + "nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | " + "nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | " + "nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | " + "nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | " + "nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | " + "nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'" ), "13.0": ( - "nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cudnn-cu13==9.12.0.46; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 
'x86_64' | " - "nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'" + "nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | " + "nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | " + "nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | " + "nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | " + "nvidia-cublas==13.0.0.19; platform_system == 'Linux' | " + "nvidia-cufft==12.0.0.15; platform_system == 'Linux' | " + "nvidia-curand==10.4.0.35; platform_system == 'Linux' | " + "nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | " + "nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | " + "nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | " + "nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | " + "nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | " + "nvidia-nvtx==13.0.39; platform_system == 'Linux' | " + "nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | " + "nvidia-cufile==1.15.0.42; platform_system == 'Linux'" ), "xpu": ( - "intel-cmplr-lib-rt==2025.1.1 | " - "intel-cmplr-lib-ur==2025.1.1 | " - "intel-cmplr-lic-rt==2025.1.1 | " - "intel-sycl-rt==2025.1.1 | " - "oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "onemkl-sycl-blas==2025.1.0 | " - "onemkl-sycl-dft==2025.1.0 | " - "onemkl-sycl-lapack==2025.1.0 | " - "onemkl-sycl-rng==2025.1.0 | " - "onemkl-sycl-sparse==2025.1.0 | " - "dpcpp-cpp-rt==2025.1.1 | " - "intel-opencl-rt==2025.1.1 | " - "mkl==2025.1.0 | " - "intel-openmp==2025.1.1 | " - "tbb==2022.1.0 | " - "tcmlib==1.3.0 | " - "umf==0.10.0 | " - "intel-pti==0.12.3" + "intel-cmplr-lib-rt==2025.2.1 | " + "intel-cmplr-lib-ur==2025.2.1 | " + "intel-cmplr-lic-rt==2025.2.1 | " + "intel-sycl-rt==2025.2.1 | " + "oneccl-devel==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "oneccl==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "impi-rt==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "onemkl-sycl-blas==2025.2.0 | " + "onemkl-sycl-dft==2025.2.0 | " + "onemkl-sycl-lapack==2025.2.0 | " + "onemkl-sycl-rng==2025.2.0 | " + "onemkl-sycl-sparse==2025.2.0 | " + "dpcpp-cpp-rt==2025.2.1 | " + "intel-opencl-rt==2025.2.1 | " + "mkl==2025.2.0 | " + "intel-openmp==2025.2.1 | " + "tbb==2022.2.0 | " + "tcmlib==1.4.0 | " + "umf==0.11.0 | " + "intel-pti==0.13.1" ), } @@ -240,12 +221,8 @@ def generate_libtorch_matrix( if os == "linux": arches += CUDA_ARCHES arches += ROCM_ARCHES - if "13.0" in arches: - arches.remove("13.0") elif os == "windows": arches += CUDA_ARCHES - if "13.0" in arches: - arches.remove("13.0") if libtorch_variants is None: libtorch_variants = [ "shared-with-deps", @@ -310,8 +287,6 @@ def generate_wheels_matrix( arches += CUDA_ARCHES + ROCM_ARCHES + XPU_ARCHES elif os == "windows": arches += CUDA_ARCHES + XPU_ARCHES - if "13.0" in arches: - arches.remove("13.0") elif os == "linux-aarch64": # Separate new if as the CPU type is different and # uses different build/test scripts @@ -334,19 +309,20 @@ def generate_wheels_matrix( else arch_version ) - # TODO: Enable python 3.13t on cpu-s390x - if gpu_arch_type == "cpu-s390x" and python_version == "3.13t": - continue # TODO: Enable python 3.14 for rest - if os not in ["linux", "linux-aarch64", "macos-arm64", "windows"] and ( - python_version == "3.14" or 
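Editorial note: PYTORCH_EXTRA_INSTALL_REQUIREMENTS entries are '|'-separated PEP 508 requirement strings, and the change above drops the platform_machine == 'x86_64' clause so the pins also apply on Linux aarch64. A short sketch of how such environment markers evaluate, using the packaging library (assumed available; splitting on ' | ' mirrors how the string is written here, not necessarily how CI consumes it):

from packaging.requirements import Requirement

extra = (
    "nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | "
    "nvidia-nccl-cu12==2.27.5; platform_system == 'Linux'"
)

for spec in extra.split(" | "):
    req = Requirement(spec)
    # Evaluate the marker against a hypothetical Linux aarch64 environment;
    # without the x86_64 clause the requirement now applies there too.
    applies = req.marker.evaluate({"platform_system": "Linux", "platform_machine": "aarch64"})
    print(req.name, req.specifier, "applies on linux/aarch64:", applies)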
python_version == "3.14t" - ): + if os not in [ + "linux", + "linux-aarch64", + "linux-s390x", + "macos-arm64", + "windows", + ] and (python_version == "3.14" or python_version == "3.14t"): continue # cuda linux wheels require PYTORCH_EXTRA_INSTALL_REQUIREMENTS to install if ( - arch_version in ["13.0", "12.9", "12.8", "12.6"] + arch_version in ["13.0", "12.8", "12.6"] and os == "linux" or arch_version in CUDA_AARCH64_ARCHES ): @@ -410,6 +386,5 @@ def generate_wheels_matrix( validate_nccl_dep_consistency("13.0") -validate_nccl_dep_consistency("12.9") validate_nccl_dep_consistency("12.8") validate_nccl_dep_consistency("12.6") diff --git a/.github/scripts/generate_ci_workflows.py b/.github/scripts/generate_ci_workflows.py index 67906d4ad88d..0396c405ad0a 100755 --- a/.github/scripts/generate_ci_workflows.py +++ b/.github/scripts/generate_ci_workflows.py @@ -135,7 +135,7 @@ class OperatingSystem: build_configs=generate_binary_build_matrix.generate_wheels_matrix( OperatingSystem.LINUX, arches=["6.4"], - python_versions=["3.9"], + python_versions=["3.10"], ), ciflow_config=CIFlowConfig( labels={ diff --git a/.github/scripts/test_trymerge.py b/.github/scripts/test_trymerge.py index 58f3ca50baa1..ac3a1cc12921 100755 --- a/.github/scripts/test_trymerge.py +++ b/.github/scripts/test_trymerge.py @@ -27,6 +27,7 @@ get_drci_classifications, gh_get_team_members, GitHubPR, + iter_issue_timeline_until_comment, JobCheckState, main as trymerge_main, MandatoryChecksMissingError, @@ -34,6 +35,8 @@ RE_GHSTACK_DESC, read_merge_rules, remove_job_name_suffix, + sha_from_committed_event, + sha_from_force_push_after, validate_revert, ) @@ -124,7 +127,7 @@ def __init__(self) -> None: self.force = force self.pr_num = 76123 self.dry_run = True - self.comment_id = 0 + self.comment_id = 12345 # Set to non-zero value self.reason = "this is for testing" self.ignore_current = False self.check_mergeability = False @@ -152,9 +155,9 @@ def mock_revert( def mock_merge( pr: GitHubPR, repo: GitRepo, + comment_id: int, dry_run: bool = False, skip_mandatory_checks: bool = False, - comment_id: Optional[int] = None, timeout_minutes: int = 400, stale_pr_days: int = 3, ignore_current: bool = False, @@ -470,9 +473,9 @@ def test_main_force( mock_merge.assert_called_once_with( mock.ANY, mock.ANY, + comment_id=mock.ANY, dry_run=mock.ANY, skip_mandatory_checks=True, - comment_id=mock.ANY, ignore_current=False, ) @@ -485,9 +488,9 @@ def test_main_merge(self, mock_merge: Any, *args: Any) -> None: mock_merge.assert_called_once_with( mock.ANY, mock.ANY, + comment_id=mock.ANY, dry_run=mock.ANY, skip_mandatory_checks=False, - comment_id=mock.ANY, ignore_current=False, ) @@ -1138,5 +1141,176 @@ def test__revlist_to_prs_two_prs( ) +@mock.patch("trymerge.gh_graphql", side_effect=mocked_gh_graphql) +@mock.patch("trymerge.gh_fetch_merge_base", return_value="") +@mock.patch( + "trymerge.get_drci_classifications", side_effect=mocked_drci_classifications +) +class TestTimelineFunctions(TestCase): + """Tests for the new timeline-related functions""" + + def test_sha_from_committed_event(self, *args: Any) -> None: + """Test extracting SHA from committed event""" + # Based on actual GitHub API format - committed events have "sha" at top level + event = { + "event": "committed", + "sha": "fb21ce932ded6670c918804a0d9151b773770a7c", + } + self.assertEqual( + sha_from_committed_event(event), "fb21ce932ded6670c918804a0d9151b773770a7c" + ) + + # Test with missing SHA + event_no_sha = {"event": "committed"} + 
self.assertIsNone(sha_from_committed_event(event_no_sha)) + + def test_sha_from_force_push_after(self, *args: Any) -> None: + """Test extracting SHA from force push event""" + # NOTE: The current function doesn't handle the actual GitHub API format + # Real force push events have "commit_id" at top level, but this function + # looks for "after", "after_commit", "after_sha", or "head_sha" fields + + # Test with the legacy format the current function handles + event_legacy = { + "event": "head_ref_force_pushed", + "after": {"sha": "ef22bcbc54bb0f787e1e4ffd3d83df18fc407f5e"}, + } + self.assertEqual( + sha_from_force_push_after(event_legacy), + "ef22bcbc54bb0f787e1e4ffd3d83df18fc407f5e", + ) + + # Test with current GitHub API format (should return None with current implementation) + event_real_api = { + "event": "head_ref_force_pushed", + "commit_id": "ef22bcbc54bb0f787e1e4ffd3d83df18fc407f5e", + } + self.assertEqual( + sha_from_force_push_after(event_real_api), + "ef22bcbc54bb0f787e1e4ffd3d83df18fc407f5e", + ) # Current function doesn't handle commit_id + + # Test with missing SHA + event_no_sha = {"event": "head_ref_force_pushed"} + self.assertIsNone(sha_from_force_push_after(event_no_sha)) + + @mock.patch("trymerge.gh_fetch_json_list") + def test_iter_issue_timeline_until_comment( + self, mock_gh_fetch_json_list: Any, *args: Any + ) -> None: + """Test timeline iteration until target comment""" + # Mock timeline data based on actual GitHub API format + timeline_data = [ + {"event": "commented", "id": 100, "body": "first comment"}, + {"event": "committed", "sha": "fb21ce932ded6670c918804a0d9151b773770a7c"}, + {"event": "commented", "id": 200, "body": "target comment"}, + {"event": "commented", "id": 300, "body": "after target"}, + ] + mock_gh_fetch_json_list.return_value = timeline_data + + # Test iteration stops at target comment + events = list(iter_issue_timeline_until_comment("pytorch", "pytorch", 123, 200)) + self.assertEqual(len(events), 3) # Should stop at target comment + self.assertEqual(events[0]["event"], "commented") + self.assertEqual(events[0]["id"], 100) + self.assertEqual(events[1]["event"], "committed") + self.assertEqual(events[1]["sha"], "fb21ce932ded6670c918804a0d9151b773770a7c") + self.assertEqual(events[2]["event"], "commented") + self.assertEqual(events[2]["id"], 200) + + @mock.patch("trymerge.gh_fetch_json_list") + def test_iter_issue_timeline_until_comment_not_found( + self, mock_gh_fetch_json_list: Any, *args: Any + ) -> None: + """Test timeline iteration when target comment is not found""" + # Mock empty timeline + mock_gh_fetch_json_list.return_value = [] + + events = list(iter_issue_timeline_until_comment("pytorch", "pytorch", 123, 999)) + self.assertEqual(len(events), 0) + + @mock.patch("trymerge.iter_issue_timeline_until_comment") + def test_get_commit_sha_at_comment_commit_after_comment( + self, mock_iter_timeline: Any, *args: Any + ) -> None: + """Test get_commit_sha_at_comment returns correct SHA after comment""" + mock_iter_timeline.return_value = [ + {"event": "committed", "sha": "commit1"}, + {"event": "committed", "sha": "commit2"}, + {"event": "commented", "id": 100}, + {"event": "head_ref_force_pushed", "after": {"sha": "commit3"}}, + ] + pr = GitHubPR("pytorch", "pytorch", 77700) + sha = pr.get_commit_sha_at_comment(100) + self.assertEqual(sha, "commit2") + + @mock.patch("trymerge.iter_issue_timeline_until_comment") + def test_get_commit_sha_at_comment_force_push_before_comment( + self, mock_iter_timeline: Any, *args: Any + ) -> None: + 
mock_iter_timeline.return_value = [ + {"event": "committed", "sha": "commit1"}, + {"event": "committed", "sha": "commit2"}, + {"event": "head_ref_force_pushed", "commit_id": "commit3"}, + {"event": "commented", "id": 100}, + ] + pr = GitHubPR("pytorch", "pytorch", 77700) + sha = pr.get_commit_sha_at_comment(100) + self.assertEqual(sha, "commit3") + + @mock.patch("trymerge.iter_issue_timeline_until_comment") + def test_get_commit_sha_at_comment_force_push_before_comment_legacy_mode( + self, mock_iter_timeline: Any, *args: Any + ) -> None: + mock_iter_timeline.return_value = [ + {"event": "committed", "sha": "commit1"}, + {"event": "committed", "sha": "commit2"}, + {"event": "head_ref_force_pushed", "after": {"sha": "commit3"}}, + {"event": "commented", "id": 100}, + ] + pr = GitHubPR("pytorch", "pytorch", 77700) + sha = pr.get_commit_sha_at_comment(100) + self.assertEqual(sha, "commit3") + + @mock.patch("trymerge.iter_issue_timeline_until_comment") + def test_get_commit_sha_at_comment_multiple_comments( + self, mock_iter_timeline: Any, *args: Any + ) -> None: + mock_iter_timeline.return_value = [ + {"event": "committed", "sha": "commit1"}, + {"event": "commented", "id": 100}, + {"event": "committed", "sha": "commit2"}, + {"event": "commented", "id": 200}, + {"event": "head_ref_force_pushed", "after": {"sha": "commit3"}}, + {"event": "commented", "id": 300}, + ] + pr = GitHubPR("pytorch", "pytorch", 77700) + sha = pr.get_commit_sha_at_comment(200) + self.assertEqual(sha, "commit2") + sha = pr.get_commit_sha_at_comment(300) + self.assertEqual(sha, "commit3") + + @mock.patch("trymerge.iter_issue_timeline_until_comment") + def test_get_commit_sha_at_comment_no_events( + self, mock_iter_timeline: Any, *args: Any + ) -> None: + mock_iter_timeline.return_value = [ + {"event": "commented", "id": 100}, + {"event": "labeled", "label": {"name": "test"}}, + ] + pr = GitHubPR("pytorch", "pytorch", 77700) + sha = pr.get_commit_sha_at_comment(100) + self.assertIsNone(sha) + + @mock.patch("trymerge.iter_issue_timeline_until_comment") + def test_get_commit_sha_at_comment_exception( + self, mock_iter_timeline: Any, *args: Any + ) -> None: + mock_iter_timeline.side_effect = Exception("API error") + pr = GitHubPR("pytorch", "pytorch", 77700) + sha = pr.get_commit_sha_at_comment(100) + self.assertIsNone(sha) + + if __name__ == "__main__": main() diff --git a/.github/scripts/trymerge.py b/.github/scripts/trymerge.py index 695a53305a05..00b66869dcf2 100755 --- a/.github/scripts/trymerge.py +++ b/.github/scripts/trymerge.py @@ -450,6 +450,63 @@ def __init__(self, name: str, url: str, run_id: int, status: Optional[str]): IGNORABLE_FAILED_CHECKS_THESHOLD = 10 +def iter_issue_timeline_until_comment( + org: str, repo: str, issue_number: int, target_comment_id: int, max_pages: int = 200 +) -> Any: + """ + Yield timeline entries in order until (and including) the entry whose id == target_comment_id + for a 'commented' event. Stops once the target comment is encountered. 
+ """ + page = 1 + + while page <= max_pages: + url = ( + f"https://api.github.com/repos/{org}/{repo}/issues/{issue_number}/timeline" + ) + params = {"per_page": 100, "page": page} + + batch = gh_fetch_json_list(url, params) + + if not batch: + return + for ev in batch: + # The target is the issue comment row with event == "commented" and id == issue_comment_id + if ev.get("event") == "commented" and ev.get("id") == target_comment_id: + yield ev # nothing in the timeline after this matters, so stop early + return + yield ev + if len(batch) < 100: + return + page += 1 + + # If we got here without finding the comment, then we either hit a bug or some github PR + # has a _really_ long timeline. + # The max # of pages found on any pytorch/pytorch PR at the time of this change was 41 + raise RuntimeError( + f"Could not find a merge commit in the first {max_pages} pages of the timeline at url {url}." + f"This is most likely a bug, please report it to the @pytorch/pytorch-dev-infra team." + ) + + +def sha_from_committed_event(ev: dict[str, Any]) -> Optional[str]: + """Extract SHA from committed event in timeline""" + return ev.get("sha") + + +def sha_from_force_push_after(ev: dict[str, Any]) -> Optional[str]: + """Extract SHA from force push event in timeline""" + # The current GitHub API format + commit_id = ev.get("commit_id") + if commit_id: + return str(commit_id) + + # Legacy format + after = ev.get("after") or ev.get("after_commit") or {} + if isinstance(after, dict): + return after.get("sha") or after.get("oid") + return ev.get("after_sha") or ev.get("head_sha") + + def gh_get_pr_info(org: str, proj: str, pr_no: int) -> Any: rc = gh_graphql(GH_GET_PR_INFO_QUERY, name=proj, owner=org, number=pr_no) return rc["data"]["repository"]["pullRequest"] @@ -737,16 +794,24 @@ def get_changed_files_count(self) -> int: def last_commit(self) -> Any: return self.info["commits"]["nodes"][-1]["commit"] + def last_commit_sha(self, default: Optional[str] = None) -> str: + # for commits, the oid is the sha + + if default is None: + return str(self.last_commit()["oid"]) + + return str(self.last_commit().get("oid", default)) + def get_merge_base(self) -> str: if self.merge_base: return self.merge_base - last_commit_oid = self.last_commit()["oid"] + last_commit_sha = self.last_commit_sha() # NB: We could use self.base_ref() here for regular PR, however, that doesn't # work for ghstack where the base is the custom branch, i.e. gh/USER/ID/base, # so let's just use main instead self.merge_base = gh_fetch_merge_base( - self.org, self.project, last_commit_oid, self.default_branch() + self.org, self.project, last_commit_sha, self.default_branch() ) # Fallback to baseRefOid if the API call fails, i.e. rate limit. Note that baseRefOid @@ -835,6 +900,44 @@ def get_approved_by(self) -> list[str]: def get_commit_count(self) -> int: return int(self.info["commits_with_authors"]["totalCount"]) + def get_commit_sha_at_comment(self, comment_id: int) -> Optional[str]: + """ + Get the PR head commit SHA that was present when a specific comment was posted. + This ensures we only merge the state of the PR at the time the merge command was issued, + not any subsequent commits that may have been pushed after. + + Returns None if no head-changing events found before the comment or if the comment was not found. 
+ """ + head = None + + try: + for event in iter_issue_timeline_until_comment( + self.org, self.project, self.pr_num, comment_id + ): + etype = event.get("event") + if etype == "committed": + sha = sha_from_committed_event(event) + if sha: + head = sha + print(f"Timeline: Found commit event for SHA {sha}") + elif etype == "head_ref_force_pushed": + sha = sha_from_force_push_after(event) + if sha: + head = sha + print(f"Timeline: Found force push event for SHA {sha}") + elif etype == "commented": + if event.get("id") == comment_id: + print(f"Timeline: Found final comment with sha {sha}") + return head + except Exception as e: + print( + f"Warning: Failed to reconstruct timeline for comment {comment_id}: {e}" + ) + return None + + print(f"Did not find comment with id {comment_id} in the PR timeline") + return None + def get_pr_creator_login(self) -> str: return cast(str, self.info["author"]["login"]) @@ -1151,7 +1254,7 @@ def merge_into( *, skip_mandatory_checks: bool = False, dry_run: bool = False, - comment_id: Optional[int] = None, + comment_id: int, ignore_current_checks: Optional[list[str]] = None, ) -> None: # Raises exception if matching rule is not found @@ -1167,7 +1270,7 @@ def merge_into( skip_internal_checks=can_skip_internal_checks(self, comment_id), ignore_current_checks=ignore_current_checks, ) - additional_merged_prs = self.merge_changes( + additional_merged_prs = self.merge_changes_locally( repo, skip_mandatory_checks, comment_id ) @@ -1196,7 +1299,7 @@ def merge_into( broken_trunk_checks=ignorable_checks.get("BROKEN_TRUNK", []), flaky_checks=ignorable_checks.get("FLAKY", []), unstable_checks=ignorable_checks.get("UNSTABLE", []), - last_commit_sha=self.last_commit().get("oid", ""), + last_commit_sha=self.last_commit_sha(default=""), merge_base_sha=self.get_merge_base(), merge_commit_sha=merge_commit_sha, is_failed=False, @@ -1217,7 +1320,7 @@ def merge_into( dry_run=dry_run, ) - def merge_changes( + def merge_changes_locally( self, repo: GitRepo, skip_mandatory_checks: bool = False, @@ -1226,27 +1329,15 @@ def merge_changes( skip_all_rule_checks: bool = False, ) -> list["GitHubPR"]: """ - :param skip_all_rule_checks: If true, skips all rule checks, useful for dry-running merge locally + :param skip_all_rule_checks: If true, skips all rule checks on ghstack PRs, useful for dry-running merge locally """ branch_to_merge_into = self.default_branch() if branch is None else branch if repo.current_branch() != branch_to_merge_into: repo.checkout(branch_to_merge_into) - if not self.is_ghstack_pr(): - msg = self.gen_commit_message() - pr_branch_name = f"__pull-request-{self.pr_num}__init__" - repo.fetch(self.last_commit()["oid"], pr_branch_name) - repo._run_git("merge", "--squash", pr_branch_name) - repo._run_git("commit", f'--author="{self.get_author()}"', "-m", msg) - - # Did the PR change since we started the merge? - pulled_sha = repo.show_ref(pr_branch_name) - latest_pr_status = GitHubPR(self.org, self.project, self.pr_num) - if pulled_sha != latest_pr_status.last_commit()["oid"]: - raise RuntimeError( - "PR has been updated since CI checks last passed. Please rerun the merge command." - ) - return [] - else: + + # It's okay to skip the commit SHA check for ghstack PRs since + # authoring requires write access to the repo. 
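Editorial note: the timeline helpers above page through GitHub's issue timeline REST endpoint 100 events at a time and stop at the 'commented' event whose id matches the merge comment, which is how get_commit_sha_at_comment reconstructs the PR head at comment time. A standalone sketch of the same pagination loop using requests directly (token handling and error checking simplified; gh_fetch_json_list in trymerge.py wraps the same endpoint):

import os
import requests

def timeline_until_comment(org: str, repo: str, issue: int, target_comment_id: int):
    """Yield issue timeline events up to and including the target 'commented' event."""
    url = f"https://api.github.com/repos/{org}/{repo}/issues/{issue}/timeline"
    headers = {"Accept": "application/vnd.github+json"}
    if os.environ.get("GITHUB_TOKEN"):
        headers["Authorization"] = f"Bearer {os.environ['GITHUB_TOKEN']}"
    page = 1
    while True:
        batch = requests.get(
            url, headers=headers, params={"per_page": 100, "page": page}, timeout=30
        ).json()
        if not batch:
            return
        for ev in batch:
            yield ev
            if ev.get("event") == "commented" and ev.get("id") == target_comment_id:
                return  # nothing after the merge comment matters
        if len(batch) < 100:
            return
        page += 1

# e.g. events = list(timeline_until_comment("pytorch", "pytorch", 12345, 67890))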
+ if self.is_ghstack_pr(): return self.merge_ghstack_into( repo, skip_mandatory_checks, @@ -1254,6 +1345,48 @@ def merge_changes( skip_all_rule_checks=skip_all_rule_checks, ) + msg = self.gen_commit_message() + pr_branch_name = f"__pull-request-{self.pr_num}__init__" + + # Determine which commit SHA to merge + commit_to_merge = None + if not comment_id: + raise ValueError("Must provide --comment-id when merging regular PRs") + + # Get the commit SHA that was present when the comment was made + commit_to_merge = self.get_commit_sha_at_comment(comment_id) + if not commit_to_merge: + raise RuntimeError( + f"Could not find commit that was pushed before comment {comment_id}" + ) + + # Validate that this commit is the latest commit on the PR + latest_commit = self.last_commit_sha() + if commit_to_merge != latest_commit: + raise RuntimeError( + f"Commit {commit_to_merge} was HEAD when comment {comment_id} was posted " + f"but now the latest commit on the PR is {latest_commit}. " + f"Please re-issue the merge command to merge the latest commit." + ) + + print(f"Merging commit {commit_to_merge} locally") + + repo.fetch(commit_to_merge, pr_branch_name) + repo._run_git("merge", "--squash", pr_branch_name) + repo._run_git("commit", f'--author="{self.get_author()}"', "-m", msg) + + # Did the PR change since we started the merge? + pulled_sha = repo.show_ref(pr_branch_name) + latest_pr_status = GitHubPR(self.org, self.project, self.pr_num) + if ( + pulled_sha != latest_pr_status.last_commit_sha() + or pulled_sha != commit_to_merge + ): + raise RuntimeError( + "PR has been updated since CI checks last passed. Please rerun the merge command." + ) + return [] + class MergeRuleFailedError(RuntimeError): def __init__(self, message: str, rule: Optional["MergeRule"] = None) -> None: @@ -1458,7 +1591,7 @@ def find_matching_merge_rule( pending_checks = [] failed_checks = [] - hud_link = f"https://hud.pytorch.org/{pr.org}/{pr.project}/commit/{pr.last_commit()['oid']}" + hud_link = f"https://hud.pytorch.org/{pr.org}/{pr.project}/commit/{pr.last_commit_sha()}" if len(failed_checks) > 0: if reject_reason_score < 30000: reject_reason_score = 30000 @@ -2156,14 +2289,14 @@ def categorize_checks( def merge( pr: GitHubPR, repo: GitRepo, + comment_id: int, dry_run: bool = False, skip_mandatory_checks: bool = False, - comment_id: Optional[int] = None, timeout_minutes: int = 400, stale_pr_days: int = 3, ignore_current: bool = False, ) -> None: - initial_commit_sha = pr.last_commit()["oid"] + initial_commit_sha = pr.last_commit_sha() pr_link = f"https://github.com/{pr.org}/{pr.project}/pull/{pr.pr_num}" print(f"Attempting merge of {initial_commit_sha} ({pr_link})") @@ -2234,7 +2367,7 @@ def merge( f"Attempting merge of https://github.com/{pr.org}/{pr.project}/pull/{pr.pr_num} ({elapsed_time / 60} minutes elapsed)" ) pr = GitHubPR(pr.org, pr.project, pr.pr_num) - if initial_commit_sha != pr.last_commit()["oid"]: + if initial_commit_sha != pr.last_commit_sha(): raise RuntimeError( "New commits were pushed while merging. Please rerun the merge command." 
) @@ -2401,7 +2534,7 @@ def handle_exception(e: Exception, title: str = "Merge failed") -> None: if args.check_mergeability: if pr.is_ghstack_pr(): get_ghstack_prs(repo, pr) # raises error if out of sync - pr.merge_changes( + pr.merge_changes_locally( repo, skip_mandatory_checks=True, skip_all_rule_checks=True, @@ -2416,12 +2549,18 @@ def handle_exception(e: Exception, title: str = "Merge failed") -> None: gh_post_pr_comment(org, project, args.pr_num, message, dry_run=args.dry_run) return try: + # Ensure comment id is set, else fail + if not args.comment_id: + raise ValueError( + "Comment ID is required for merging PRs, please provide it using --comment-id" + ) + merge( pr, repo, + comment_id=args.comment_id, dry_run=args.dry_run, skip_mandatory_checks=args.force, - comment_id=args.comment_id, ignore_current=args.ignore_current, ) except Exception as e: @@ -2443,7 +2582,7 @@ def handle_exception(e: Exception, title: str = "Merge failed") -> None: broken_trunk_checks=[], flaky_checks=[], unstable_checks=[], - last_commit_sha=pr.last_commit().get("oid", ""), + last_commit_sha=pr.last_commit_sha(default=""), merge_base_sha=pr.get_merge_base(), is_failed=True, skip_mandatory_checks=args.force, diff --git a/.github/templates/common.yml.j2 b/.github/templates/common.yml.j2 index 23d4c003efa8..7c93fdf522a4 100644 --- a/.github/templates/common.yml.j2 +++ b/.github/templates/common.yml.j2 @@ -4,7 +4,7 @@ {%- set download_artifact_action = "actions/download-artifact@v4.1.7" -%} {%- set timeout_minutes = 240 -%} -{%- set timeout_minutes_windows_binary = 300 -%} +{%- set timeout_minutes_windows_binary = 360 -%} {%- macro concurrency(build_environment) -%} concurrency: @@ -32,7 +32,7 @@ concurrency: {%- macro setup_ec2_windows() -%} !{{ display_ec2_information() }} - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/templates/linux_binary_build_workflow.yml.j2 b/.github/templates/linux_binary_build_workflow.yml.j2 index e0998e46fb5f..bf7db5866e78 100644 --- a/.github/templates/linux_binary_build_workflow.yml.j2 +++ b/.github/templates/linux_binary_build_workflow.yml.j2 @@ -56,7 +56,7 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.9 with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -77,6 +77,9 @@ jobs: runs_on: linux.s390x ALPINE_IMAGE: "docker.io/s390x/alpine" timeout-minutes: 420 + {%- elif config["gpu_arch_type"] == "rocm" %} + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + timeout-minutes: 300 {%- elif "conda" in build_environment and config["gpu_arch_type"] == "cuda" %} runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runs_on: linux.24xlarge.ephemeral @@ -135,7 +138,7 @@ jobs: contents: read steps: - name: Setup XPU - uses: ./.github/actions/setup-xpu + uses: pytorch/pytorch/.github/actions/setup-xpu@release/2.9 - name: configure aws credentials id: aws_creds uses: aws-actions/configure-aws-credentials@v4 @@ -150,10 +153,10 @@ jobs: with: name: !{{ config["build_name"] }} path: "${{ runner.temp }}/artifacts/" - !{{ common.checkout(deep_clone=False, 
directory="pytorch") }} + !{{ common.checkout(deep_clone=False, directory="pytorch", checkout_pr_head=False) }} - name: Calculate docker image id: calculate-docker-image - uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.9 with: docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: !{{ config["container_image"] }} @@ -161,7 +164,7 @@ jobs: docker-build-dir: .ci/docker working-directory: pytorch - name: Pull Docker image - uses: pytorch/test-infra/.github/actions/pull-docker-image@main + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.9 with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Test Pytorch binary @@ -182,7 +185,7 @@ jobs: with: name: !{{ config["build_name"] }} path: "${{ runner.temp }}/artifacts/" - !{{ common.checkout(deep_clone=False, directory="pytorch") }} + !{{ common.checkout(deep_clone=False, directory="pytorch", checkout_pr_head=False) }} - name: ROCm set GPU_FLAG run: | echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}" @@ -196,7 +199,7 @@ jobs: role-duration-seconds: 18000 - name: Calculate docker image id: calculate-docker-image - uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.9 with: docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: !{{ config["container_image"] }} @@ -204,7 +207,7 @@ jobs: docker-build-dir: .ci/docker working-directory: pytorch - name: Pull Docker image - uses: pytorch/test-infra/.github/actions/pull-docker-image@main + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.9 with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Test Pytorch binary diff --git a/.github/templates/macos_binary_build_workflow.yml.j2 b/.github/templates/macos_binary_build_workflow.yml.j2 index 02fa68f54172..662060bb1307 100644 --- a/.github/templates/macos_binary_build_workflow.yml.j2 +++ b/.github/templates/macos_binary_build_workflow.yml.j2 @@ -68,12 +68,7 @@ jobs: chmod +x "${RUNNER_TEMP}/conda.sh" /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - if [ -d "/Applications/Xcode_14.3.1.app" ]; then - echo "DEVELOPER_DIR=/Applications/Xcode_14.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" - elif [ -d "/Applications/Xcode_13.3.1.app" ]; then - echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" - fi - !{{ common.checkout(deep_clone=False, directory="pytorch") }} + !{{ common.checkout(deep_clone=False, directory="pytorch", checkout_pr_head=False) }} - name: Populate binary env run: | # shellcheck disable=SC1091 diff --git a/.github/templates/upload.yml.j2 b/.github/templates/upload.yml.j2 index 1039a6214a7a..5e3798f8e237 100644 --- a/.github/templates/upload.yml.j2 +++ b/.github/templates/upload.yml.j2 @@ -33,7 +33,7 @@ {%- if is_windows %} # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" {%- endif %} {%- else %} diff --git 
a/.github/templates/windows_binary_build_workflow.yml.j2 b/.github/templates/windows_binary_build_workflow.yml.j2 index c3a824ad05a3..c61686f8df27 100644 --- a/.github/templates/windows_binary_build_workflow.yml.j2 +++ b/.github/templates/windows_binary_build_workflow.yml.j2 @@ -64,7 +64,7 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.9 with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -135,7 +135,7 @@ jobs: {%- else %} !{{ set_runner_specific_vars() }} !{{ common.setup_ec2_windows() }} - !{{ common.checkout(deep_clone=False, directory="pytorch") }} + !{{ common.checkout(deep_clone=False, directory="pytorch", checkout_pr_head=False) }} {%- endif %} - name: Populate binary env shell: bash @@ -211,7 +211,7 @@ jobs: "pytorch/.ci/pytorch/windows/arm64/bootstrap_rust.bat" {%- else %} !{{ common.setup_ec2_windows() }} - !{{ common.checkout(deep_clone=False, directory="pytorch") }} + !{{ common.checkout(deep_clone=False, directory="pytorch", checkout_pr_head=False) }} !{{ set_runner_specific_vars() }} {%- endif %} - uses: !{{ common.download_artifact_action }} diff --git a/.github/workflows/_bazel-build-test.yml b/.github/workflows/_bazel-build-test.yml index 72241a772be6..d9e5e29576d4 100644 --- a/.github/workflows/_bazel-build-test.yml +++ b/.github/workflows/_bazel-build-test.yml @@ -47,7 +47,7 @@ jobs: reenabled-issues: ${{ steps.filter.outputs.reenabled-issues }} steps: - name: Checkout PyTorch - uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.9 with: fetch-depth: 1 submodules: false @@ -69,25 +69,25 @@ jobs: runs-on: ${{ matrix.runner }} steps: - name: Setup SSH (Click me for login details) - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 with: github-secret: ${{ secrets.GITHUB_TOKEN }} # [see note: pytorch repo ref] - name: Checkout PyTorch - uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.9 - name: Setup Linux uses: ./.github/actions/setup-linux - name: Calculate docker image id: calculate-docker-image - uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.9 with: docker-image-name: ${{ inputs.docker-image-name }} - name: Pull docker image - uses: pytorch/test-infra/.github/actions/pull-docker-image@main + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.9 with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} @@ -97,7 +97,7 @@ jobs: run: echo "IN_CONTAINER_RUNNER=$(if [ -f /.inarc ] || [ -f /.incontainer ]; then echo true ; else echo false; fi)" >> "$GITHUB_OUTPUT" - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - uses: pytorch/test-infra/.github/actions/setup-nvidia@main + uses: pytorch/test-infra/.github/actions/setup-nvidia@release/2.9 if: ${{ inputs.cuda-version != 'cpu' && steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' }} - name: Output disk space left @@ -209,5 +209,5 @@ jobs: file-suffix: bazel-${{ github.job }}_${{ steps.get-job-id.outputs.job-id }} - name: Teardown Linux - uses: 
pytorch/test-infra/.github/actions/teardown-linux@main + uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.9 if: always() diff --git a/.github/workflows/_binary-build-linux.yml b/.github/workflows/_binary-build-linux.yml index bfa035bc753b..e81e4b6a8b26 100644 --- a/.github/workflows/_binary-build-linux.yml +++ b/.github/workflows/_binary-build-linux.yml @@ -142,13 +142,13 @@ jobs: - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" if: inputs.build_environment != 'linux-s390x-binary-manywheel' - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.github-token }} - name: Checkout PyTorch - uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.9 with: no-sudo: ${{ inputs.build_environment == 'linux-aarch64-binary-manywheel' || inputs.build_environment == 'linux-s390x-binary-manywheel' }} @@ -178,7 +178,6 @@ jobs: - name: Checkout PyTorch to pytorch dir uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -213,9 +212,9 @@ jobs: - name: Calculate docker image id: calculate-docker-image if: ${{ steps.filter.outputs.is-test-matrix-empty == 'False' && inputs.build_environment != 'linux-s390x-binary-manywheel' }} - uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.9 with: - # If doing this in main or release branch, use docker.io. Otherwise + # If doing this in a release branch, use docker.io. 
Otherwise # use ECR docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: ${{ inputs.DOCKER_IMAGE }} @@ -227,7 +226,7 @@ jobs: - name: Pull Docker image if: ${{ steps.filter.outputs.is-test-matrix-empty == 'False' && inputs.build_environment != 'linux-s390x-binary-manywheel' }} - uses: pytorch/test-infra/.github/actions/pull-docker-image@main + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.9 with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} @@ -283,7 +282,7 @@ jobs: - name: Teardown Linux if: always() && inputs.build_environment != 'linux-s390x-binary-manywheel' - uses: pytorch/test-infra/.github/actions/teardown-linux@main + uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.9 - name: Chown workspace if: always() && inputs.build_environment != 'linux-s390x-binary-manywheel' diff --git a/.github/workflows/_binary-test-linux.yml b/.github/workflows/_binary-test-linux.yml index 2d9e4d0e27b2..887ab908b2d8 100644 --- a/.github/workflows/_binary-test-linux.yml +++ b/.github/workflows/_binary-test-linux.yml @@ -125,14 +125,14 @@ jobs: - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" if: inputs.build_environment != 'linux-s390x-binary-manywheel' - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.github-token }} # Setup the environment - name: Checkout PyTorch - uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.9 with: no-sudo: ${{ inputs.build_environment == 'linux-aarch64-binary-manywheel' || inputs.build_environment == 'linux-s390x-binary-manywheel' }} @@ -155,7 +155,6 @@ jobs: - name: Checkout PyTorch to pytorch dir uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive show-progress: false path: pytorch @@ -186,9 +185,7 @@ jobs: path: "${{ runner.temp }}/artifacts/" - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - uses: pytorch/test-infra/.github/actions/setup-nvidia@main - with: - driver-version: ${{ startsWith(inputs.GPU_ARCH_VERSION, '13') && '580.65.06' || '570.133.07' }} + uses: pytorch/test-infra/.github/actions/setup-nvidia@release/2.9 if: ${{ inputs.GPU_ARCH_TYPE == 'cuda' && steps.filter.outputs.is-test-matrix-empty == 'False' }} - name: configure aws credentials @@ -203,7 +200,7 @@ jobs: - name: Calculate docker image id: calculate-docker-image if: ${{ steps.filter.outputs.is-test-matrix-empty == 'False' && inputs.build_environment != 'linux-s390x-binary-manywheel' }} - uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.9 with: docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: ${{ inputs.DOCKER_IMAGE }} @@ -213,7 +210,7 @@ jobs: - name: Pull Docker image if: ${{ steps.filter.outputs.is-test-matrix-empty == 'False' && inputs.build_environment != 'linux-s390x-binary-manywheel' }} - uses: pytorch/test-infra/.github/actions/pull-docker-image@main + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.9 with: docker-image: 
${{ steps.calculate-docker-image.outputs.docker-image }} @@ -225,7 +222,7 @@ jobs: - name: Teardown Linux if: always() && inputs.build_environment != 'linux-s390x-binary-manywheel' - uses: pytorch/test-infra/.github/actions/teardown-linux@main + uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.9 - name: Chown workspace if: always() && inputs.build_environment != 'linux-s390x-binary-manywheel' diff --git a/.github/workflows/_binary-upload.yml b/.github/workflows/_binary-upload.yml index 636b76d42931..61896f52bbed 100644 --- a/.github/workflows/_binary-upload.yml +++ b/.github/workflows/_binary-upload.yml @@ -81,7 +81,7 @@ jobs: SHA1: ${{ github.event.pull_request.head.sha || github.sha }} steps: - name: Checkout PyTorch - uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.9 with: no-sudo: true diff --git a/.github/workflows/_docs.yml b/.github/workflows/_docs.yml index ff5dbe604bac..5980ad849fa7 100644 --- a/.github/workflows/_docs.yml +++ b/.github/workflows/_docs.yml @@ -67,7 +67,7 @@ jobs: # an OOM issue when running the job, so this upgrades the runner from 4xlarge # to the next available tier of 12xlarge. So much memory just to generate cpp # doc - runner: ${{ inputs.runner_prefix }}linux.12xlarge + runner: ${{ inputs.runner_prefix }}linux.12xlarge.memory # TODO: Nightly cpp docs take longer and longer to finish (more than 3h now) # Let's try to figure out how this can be improved timeout-minutes: 360 @@ -84,7 +84,7 @@ jobs: name: build-docs-${{ matrix.docs_type }}-${{ inputs.push }} steps: - name: Setup SSH (Click me for login details) - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 with: github-secret: ${{ secrets.GITHUB_TOKEN }} instructions: | @@ -95,7 +95,7 @@ jobs: # [see note: pytorch repo ref] - name: Checkout PyTorch - uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.9 - name: Setup Linux uses: ./.github/actions/setup-linux @@ -110,12 +110,12 @@ jobs: - name: Calculate docker image id: calculate-docker-image - uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.9 with: docker-image-name: ${{ inputs.docker-image }} - name: Pull docker image - uses: pytorch/test-infra/.github/actions/pull-docker-image@main + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.9 with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} @@ -222,5 +222,5 @@ jobs: s3-prefix: pytorch/pytorch/${{ github.event.pull_request.number }}/functorchdocs - name: Teardown Linux - uses: pytorch/test-infra/.github/actions/teardown-linux@main + uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.9 if: always() diff --git a/.github/workflows/_link_check.yml b/.github/workflows/_link_check.yml index 014e6106b073..4c46ad28cf6b 100644 --- a/.github/workflows/_link_check.yml +++ b/.github/workflows/_link_check.yml @@ -11,7 +11,7 @@ on: jobs: lint-urls: if: ${{ github.event_name != 'pull_request' || !contains(github.event.pull_request.labels.*.name, 'skip-url-lint') }} - uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@release/2.9 with: job-name: lint-urls timeout: 120 @@ -37,7 +37,7 @@ jobs: lint-xrefs: if: ${{ github.event_name != 'pull_request' || 
!contains(github.event.pull_request.labels.*.name, 'skip-xref-lint') }} - uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@release/2.9 with: job-name: lint-xrefs timeout: 60 diff --git a/.github/workflows/_linux-build.yml b/.github/workflows/_linux-build.yml index 6b4bd429e3c9..f909488850d0 100644 --- a/.github/workflows/_linux-build.yml +++ b/.github/workflows/_linux-build.yml @@ -134,7 +134,7 @@ jobs: test-matrix: ${{ steps.filter.outputs.test-matrix }} steps: - name: Setup SSH (Click me for login details) - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 if: inputs.build-environment != 'linux-s390x-binary-manywheel' with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -147,7 +147,7 @@ jobs: # checkout because when we run this action we don't *have* a local # checkout. In other cases you should prefer a local checkout. - name: Checkout PyTorch - uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.9 with: no-sudo: true @@ -183,7 +183,7 @@ jobs: - name: Calculate docker image id: calculate-docker-image - uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.9 if: inputs.build-environment != 'linux-s390x-binary-manywheel' with: docker-image-name: ${{ inputs.docker-image-name }} @@ -199,7 +199,7 @@ jobs: echo "docker pull ghcr.io/pytorch/ci-image:${tag/:/-}" - name: Pull docker image - uses: pytorch/test-infra/.github/actions/pull-docker-image@main + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.9 if: inputs.build-environment != 'linux-s390x-binary-manywheel' && steps.use-old-whl.outputs.reuse != 'true' with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} @@ -457,7 +457,7 @@ jobs: artifact_prefix: usage_log_build_${{ steps.get-job-id.outputs.job-id }} - name: Teardown Linux - uses: pytorch/test-infra/.github/actions/teardown-linux@main + uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.9 if: always() && inputs.build-environment != 'linux-s390x-binary-manywheel' - name: Cleanup docker diff --git a/.github/workflows/_linux-test.yml b/.github/workflows/_linux-test.yml index 66579b573a63..f413f497d79e 100644 --- a/.github/workflows/_linux-test.yml +++ b/.github/workflows/_linux-test.yml @@ -99,7 +99,7 @@ jobs: contents: read steps: - name: Setup SSH (Click me for login details) - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 if: ${{ !contains(matrix.runner, 'b200') && inputs.build-environment != 'linux-s390x-binary-manywheel' }} with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -108,7 +108,7 @@ jobs: docker exec -it $(docker container ps --format '{{.ID}}') bash - name: Checkout PyTorch - uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.9 with: no-sudo: true @@ -139,7 +139,7 @@ jobs: - name: Calculate docker image id: calculate-docker-image - uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.9 if: inputs.build-environment != 'linux-s390x-binary-manywheel' with: docker-image-name: ${{ inputs.docker-image }} @@ -155,7 +155,7 @@ jobs: echo "docker pull 
ghcr.io/pytorch/ci-image:${tag/:/-}" - name: Pull docker image - uses: pytorch/test-infra/.github/actions/pull-docker-image@main + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.9 if: inputs.build-environment != 'linux-s390x-binary-manywheel' with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} @@ -167,9 +167,9 @@ jobs: - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG id: install-nvidia-driver - uses: pytorch/test-infra/.github/actions/setup-nvidia@main + uses: pytorch/test-infra/.github/actions/setup-nvidia@release/2.9 with: - driver-version: ${{ matrix.config == 'legacy_nvidia_driver' && '525.105.17' || '570.133.07' }} + driver-version: ${{ matrix.config == 'legacy_nvidia_driver' && '525.105.17' || '580.82.07' }} if: ${{ contains(inputs.build-environment, 'cuda') && !contains(matrix.config, 'nogpu') && steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' && !contains(matrix.runner, 'b200') }} - name: Setup GPU_FLAG for docker run @@ -273,6 +273,8 @@ jobs: TEST_CONFIG: ${{ matrix.config }} SHARD_NUMBER: ${{ matrix.shard }} NUM_TEST_SHARDS: ${{ matrix.num_shards }} + EXTRA_FLAGS: ${{ matrix.extra_flags || '' }} + OP_BENCHMARK_TESTS: ${{ matrix.op_benchmark_tests }} REENABLED_ISSUES: ${{ steps.keep-going.outputs.reenabled-issues }} CONTINUE_THROUGH_ERROR: ${{ steps.keep-going.outputs.keep-going }} VERBOSE_TEST_LOGS: ${{ steps.keep-going.outputs.ci-verbose-test-logs }} @@ -418,7 +420,7 @@ jobs: aws-region: us-east-1 - name: Upload the benchmark results - uses: pytorch/test-infra/.github/actions/upload-benchmark-results@main + uses: pytorch/test-infra/.github/actions/upload-benchmark-results@release/2.9 if: inputs.build-environment != 'linux-s390x-binary-manywheel' with: benchmark-results-dir: test/test-reports @@ -476,7 +478,7 @@ jobs: workflow_attempt: ${{github.run_attempt}} - name: Teardown Linux - uses: pytorch/test-infra/.github/actions/teardown-linux@main + uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.9 if: always() && steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' # NB: We are currently having an intermittent GPU-related issue on G5 runners with diff --git a/.github/workflows/_mac-build.yml b/.github/workflows/_mac-build.yml index a2a5f8dd9111..9561dcc8b895 100644 --- a/.github/workflows/_mac-build.yml +++ b/.github/workflows/_mac-build.yml @@ -67,11 +67,11 @@ jobs: test-matrix: ${{ steps.filter.outputs.test-matrix }} steps: - name: Clean up disk space before running MacOS workflow - uses: pytorch/test-infra/.github/actions/check-disk-space@main + uses: pytorch/test-infra/.github/actions/check-disk-space@release/2.9 # [see note: pytorch repo ref] - name: Checkout PyTorch - uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.9 - name: Set xcode version env: @@ -82,7 +82,7 @@ jobs: fi - name: Setup Python - uses: pytorch/test-infra/.github/actions/setup-python@main + uses: pytorch/test-infra/.github/actions/setup-python@release/2.9 with: python-version: ${{ inputs.python-version }} pip-requirements-file: .github/requirements/pip-requirements-macOS.txt @@ -188,4 +188,4 @@ jobs: - name: Clean up disk space if: always() continue-on-error: true - uses: pytorch/test-infra/.github/actions/check-disk-space@main + uses: pytorch/test-infra/.github/actions/check-disk-space@release/2.9 diff --git a/.github/workflows/_mac-test.yml b/.github/workflows/_mac-test.yml index 086e25b4868e..29ff3a72817f 100644 
--- a/.github/workflows/_mac-test.yml +++ b/.github/workflows/_mac-test.yml @@ -105,11 +105,11 @@ jobs: done - name: Clean up disk space before running MacOS workflow - uses: pytorch/test-infra/.github/actions/check-disk-space@main + uses: pytorch/test-infra/.github/actions/check-disk-space@release/2.9 # [see note: pytorch repo ref] - name: Checkout PyTorch - uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.9 - name: Get workflow job id id: get-job-id @@ -119,7 +119,7 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} - name: Setup Python - uses: pytorch/test-infra/.github/actions/setup-python@main + uses: pytorch/test-infra/.github/actions/setup-python@release/2.9 with: python-version: ${{ inputs.python-version }} pip-requirements-file: .github/requirements/pip-requirements-macOS.txt @@ -257,7 +257,7 @@ jobs: file-suffix: ${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}_${{ steps.get-job-id.outputs.job-id }} - name: Upload the benchmark results - uses: pytorch/test-infra/.github/actions/upload-benchmark-results@main + uses: pytorch/test-infra/.github/actions/upload-benchmark-results@release/2.9 with: benchmark-results-dir: test/test-reports dry-run: false @@ -287,4 +287,4 @@ jobs: - name: Clean up disk space if: always() continue-on-error: true - uses: pytorch/test-infra/.github/actions/check-disk-space@main + uses: pytorch/test-infra/.github/actions/check-disk-space@release/2.9 diff --git a/.github/workflows/_rocm-test.yml b/.github/workflows/_rocm-test.yml index f73972942b5f..b6cd5d88a094 100644 --- a/.github/workflows/_rocm-test.yml +++ b/.github/workflows/_rocm-test.yml @@ -81,7 +81,7 @@ jobs: steps: # [see note: pytorch repo ref] - name: Checkout PyTorch - uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.9 with: no-sudo: true @@ -113,12 +113,12 @@ jobs: - name: Calculate docker image id: calculate-docker-image - uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.9 with: docker-image-name: ${{ inputs.docker-image }} - name: Pull docker image - uses: pytorch/test-infra/.github/actions/pull-docker-image@main + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.9 with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} @@ -330,7 +330,7 @@ jobs: aws-region: us-east-1 - name: Upload the benchmark results - uses: pytorch/test-infra/.github/actions/upload-benchmark-results@main + uses: pytorch/test-infra/.github/actions/upload-benchmark-results@release/2.9 with: benchmark-results-dir: test/test-reports dry-run: false diff --git a/.github/workflows/_runner-determinator.yml b/.github/workflows/_runner-determinator.yml index 0d674f044ec4..dd28024dbd80 100644 --- a/.github/workflows/_runner-determinator.yml +++ b/.github/workflows/_runner-determinator.yml @@ -59,7 +59,7 @@ jobs: PR_NUMBER: ${{ github.event.pull_request.number }} steps: # - name: Checkout PyTorch - # uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + # uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.9 # with: # fetch-depth: 1 # submodules: true diff --git a/.github/workflows/_win-build.yml b/.github/workflows/_win-build.yml index ebfb4001e437..92543128265d 100644 --- a/.github/workflows/_win-build.yml +++ b/.github/workflows/_win-build.yml @@ -77,6 +77,7 @@ jobs: 
run: | git config --global core.longpaths true git config --global core.symlinks true + git config --global core.ignorecase false # https://git-scm.com/docs/git-fsmonitor--daemon. The daemon could lock # the directory on Windows and prevent GHA from checking out as reported @@ -84,10 +85,10 @@ jobs: git config --global core.fsmonitor false - name: Clean up leftover processes on non-ephemeral Windows runner - uses: pytorch/test-infra/.github/actions/cleanup-runner@main + uses: pytorch/test-infra/.github/actions/cleanup-runner@release/2.9 - name: Setup SSH (Click me for login details) - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 with: github-secret: ${{ secrets.GITHUB_TOKEN }} instructions: | @@ -102,7 +103,7 @@ jobs: # [see note: pytorch repo ref] - name: Checkout PyTorch - uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.9 with: no-sudo: true @@ -150,7 +151,7 @@ jobs: BUILD_WHEEL: 1 MAX_JOBS: 8 CUDA_VERSION: ${{ inputs.cuda-version }} - PYTHON_VERSION: "3.9" + PYTHON_VERSION: "3.10" SCCACHE_BUCKET: "ossci-compiler-cache" SCCACHE_S3_KEY_PREFIX: ${{ github.workflow }} SCCACHE_REGION: us-east-1 diff --git a/.github/workflows/_win-test.yml b/.github/workflows/_win-test.yml index 0c95503928fb..37e48d99e2be 100644 --- a/.github/workflows/_win-test.yml +++ b/.github/workflows/_win-test.yml @@ -70,6 +70,7 @@ jobs: run: | git config --global core.longpaths true git config --global core.symlinks true + git config --global core.ignorecase false # https://git-scm.com/docs/git-fsmonitor--daemon. The daemon could lock # the directory on Windows and prevent GHA from checking out as reported @@ -77,10 +78,10 @@ jobs: git config --global core.fsmonitor false - name: Clean up leftover processes on non-ephemeral Windows runner - uses: pytorch/test-infra/.github/actions/cleanup-runner@main + uses: pytorch/test-infra/.github/actions/cleanup-runner@release/2.9 - name: Setup SSH (Click me for login details) - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 with: github-secret: ${{ secrets.GITHUB_TOKEN }} instructions: | @@ -96,7 +97,7 @@ jobs: # [see note: pytorch repo ref] - name: Checkout PyTorch - uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.9 with: no-sudo: true @@ -183,7 +184,7 @@ jobs: env: USE_CUDA: ${{ inputs.cuda-version != 'cpu' && '1' || '0' }} INSTALL_WINDOWS_SDK: 1 - PYTHON_VERSION: 3.9 + PYTHON_VERSION: "3.10" CONTINUE_THROUGH_ERROR: ${{ steps.keep-going.outputs.keep-going }} VERBOSE_TEST_LOGS: ${{ steps.keep-going.outputs.ci-verbose-test-logs }} TEST_SHOWLOCALS: ${{ steps.keep-going.outputs.ci-test-showlocals }} diff --git a/.github/workflows/_xpu-test.yml b/.github/workflows/_xpu-test.yml index 177e6ca4bbe3..6bceb4eef6ba 100644 --- a/.github/workflows/_xpu-test.yml +++ b/.github/workflows/_xpu-test.yml @@ -77,7 +77,7 @@ jobs: steps: # [see note: pytorch repo ref] - name: Checkout PyTorch - uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.9 - name: Setup XPU uses: ./.github/actions/setup-xpu @@ -95,7 +95,7 @@ jobs: - name: Calculate docker image id: calculate-docker-image - uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + uses: 
pytorch/test-infra/.github/actions/calculate-docker-image@release/2.9 with: docker-image-name: ${{ inputs.docker-image }} @@ -109,7 +109,7 @@ jobs: echo "docker pull ghcr.io/pytorch/ci-image:${tag/:/-}" - name: Pull docker image - uses: pytorch/test-infra/.github/actions/pull-docker-image@main + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.9 with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} @@ -275,7 +275,7 @@ jobs: - name: Change permissions if: ${{ always() && steps.test.conclusion }} run: | - docker exec -t "${{ env.CONTAINER_NAME }}" sh -c "sudo chown -R 1001:1001 test" + docker exec -t "${{ env.CONTAINER_NAME }}" sh -c "sudo chown -R 1000:1000 test" - name: Print remaining test logs shell: bash diff --git a/.github/workflows/build-almalinux-images.yml b/.github/workflows/build-almalinux-images.yml index 0754b154a358..e0492f736442 100644 --- a/.github/workflows/build-almalinux-images.yml +++ b/.github/workflows/build-almalinux-images.yml @@ -39,7 +39,7 @@ jobs: tag: ["cuda12.6", "cuda12.8", "cuda12.9", "cuda13.0", "rocm6.3", "rocm6.4", "cpu"] steps: - name: Build docker image - uses: pytorch/pytorch/.github/actions/binary-docker-build@main + uses: pytorch/pytorch/.github/actions/binary-docker-build@release/2.9 with: docker-image-name: almalinux-builder custom-tag-prefix: ${{matrix.tag}} diff --git a/.github/workflows/build-libtorch-images.yml b/.github/workflows/build-libtorch-images.yml index cc2f54fc45f8..edfa0168e19f 100644 --- a/.github/workflows/build-libtorch-images.yml +++ b/.github/workflows/build-libtorch-images.yml @@ -32,7 +32,7 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.9 with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -58,7 +58,7 @@ jobs: ] steps: - name: Build docker image - uses: pytorch/pytorch/.github/actions/binary-docker-build@main + uses: pytorch/pytorch/.github/actions/binary-docker-build@release/2.9 with: docker-image-name: libtorch-cxx11-builder custom-tag-prefix: ${{ matrix.tag }} diff --git a/.github/workflows/build-manywheel-images-s390x.yml b/.github/workflows/build-manywheel-images-s390x.yml index c498e169f1aa..a719bf21a1ca 100644 --- a/.github/workflows/build-manywheel-images-s390x.yml +++ b/.github/workflows/build-manywheel-images-s390x.yml @@ -25,7 +25,7 @@ jobs: runs-on: linux.s390x steps: - name: Checkout PyTorch - uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.9 with: submodules: false no-sudo: true diff --git a/.github/workflows/build-manywheel-images.yml b/.github/workflows/build-manywheel-images.yml index ce42d5644c93..e3549cd6284a 100644 --- a/.github/workflows/build-manywheel-images.yml +++ b/.github/workflows/build-manywheel-images.yml @@ -32,7 +32,7 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.9 with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -47,12 +47,11 @@ jobs: matrix: include: [ { name: "manylinux2_28-builder", tag: "cuda13.0", runner: 
"linux.9xlarge.ephemeral" }, - { name: "manylinux2_28-builder", tag: "cuda12.9", runner: "linux.9xlarge.ephemeral" }, { name: "manylinux2_28-builder", tag: "cuda12.8", runner: "linux.9xlarge.ephemeral" }, { name: "manylinux2_28-builder", tag: "cuda12.6", runner: "linux.9xlarge.ephemeral" }, { name: "manylinuxaarch64-builder", tag: "cuda13.0", runner: "linux.arm64.2xlarge.ephemeral" }, - { name: "manylinuxaarch64-builder", tag: "cuda12.9", runner: "linux.arm64.2xlarge.ephemeral" }, { name: "manylinuxaarch64-builder", tag: "cuda12.8", runner: "linux.arm64.2xlarge.ephemeral" }, + { name: "manylinuxaarch64-builder", tag: "cuda12.6", runner: "linux.arm64.2xlarge.ephemeral" }, { name: "manylinux2_28-builder", tag: "rocm6.3", runner: "linux.9xlarge.ephemeral" }, { name: "manylinux2_28-builder", tag: "rocm6.4", runner: "linux.9xlarge.ephemeral" }, { name: "manylinux2_28-builder", tag: "cpu", runner: "linux.9xlarge.ephemeral" }, @@ -64,7 +63,7 @@ jobs: name: ${{ matrix.name }}:${{ matrix.tag }} steps: - name: Build docker image - uses: pytorch/pytorch/.github/actions/binary-docker-build@main + uses: pytorch/pytorch/.github/actions/binary-docker-build@release/2.9 with: docker-image-name: ${{ matrix.name }} custom-tag-prefix: ${{ matrix.tag }} diff --git a/.github/workflows/build-triton-wheel.yml b/.github/workflows/build-triton-wheel.yml index 932d9c886302..8f066de47534 100644 --- a/.github/workflows/build-triton-wheel.yml +++ b/.github/workflows/build-triton-wheel.yml @@ -3,7 +3,7 @@ name: Build Triton wheels on: push: branches: - - main + - release/2.9 tags: # NOTE: Binary build pipelines should only get triggered on release candidate builds # Release candidate tags look like: v1.11.0-rc1 @@ -36,7 +36,7 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.9 with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -74,12 +74,12 @@ jobs: PLATFORM: 'manylinux_2_28_x86_64' steps: - name: Setup SSH (Click me for login details) - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 with: github-secret: ${{ secrets.GITHUB_TOKEN }} - name: Checkout PyTorch - uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.9 with: submodules: false @@ -87,7 +87,7 @@ jobs: uses: ./.github/actions/setup-linux - name: Pull Docker image - uses: pytorch/test-infra/.github/actions/pull-docker-image@main + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.9 with: docker-image: ${{ env.DOCKER_IMAGE }} @@ -184,7 +184,7 @@ jobs: path: ${{ runner.temp }}/artifacts/wheelhouse/* - name: Teardown Linux - uses: pytorch/test-infra/.github/actions/teardown-linux@main + uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.9 if: always() build-wheel-win: @@ -217,7 +217,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/build-vllm-wheel.yml 
b/.github/workflows/build-vllm-wheel.yml new file mode 100644 index 000000000000..9efedf64cce7 --- /dev/null +++ b/.github/workflows/build-vllm-wheel.yml @@ -0,0 +1,248 @@ +name: Build vLLM wheels + +on: + push: + branches: + - main + paths: + - .github/workflows/build-vllm-wheel.yml + - .github/ci_commit_pins/vllm.txt + workflow_dispatch: + pull_request: + paths: + - .github/workflows/build-vllm-wheel.yml + - .github/ci_commit_pins/vllm.txt + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} + cancel-in-progress: true + +jobs: + build-wheel: + if: github.repository_owner == 'pytorch' + strategy: + fail-fast: false + matrix: + python-version: [ '3.12' ] + # TODO (huydhn): Add cu130 https://github.com/pytorch/pytorch/pull/162000#issuecomment-3261541554 + device: [ 'cu128', 'cu129' ] + runner: [ 'linux.12xlarge.memory' ] + include: + - device: cu128 + manylinux-image: 'pytorch/manylinux2_28-builder:cuda12.8' + - device: cu129 + manylinux-image: 'pytorch/manylinux2_28-builder:cuda12.9' + name: "Build ${{ matrix.device }} vLLM wheel" + runs-on: ${{ matrix.runner }} + timeout-minutes: 480 + env: + PY_VERS: ${{ matrix.python-version }} + MANYLINUX_IMAGE: ${{ matrix.manylinux-image }} + PLATFORM: 'manylinux_2_28_x86_64' + BUILD_DEVICE: ${{ matrix.device }} + steps: + - name: Setup SSH (Click me for login details) + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.9 + with: + submodules: false + + - name: Setup Linux + uses: ./.github/actions/setup-linux + + - name: Get latest PyTorch nightly + shell: bash + run: | + set -eux + + # Keep PyTorch nightly wheel here so that we can install it later during + # vLLM build process + mkdir -p "${RUNNER_TEMP}/artifacts/" + + container_name=$(docker run \ + --tty \ + --detach \ + -e PLATFORM \ + -v "${GITHUB_WORKSPACE}:/pytorch" \ + -v "${RUNNER_TEMP}/artifacts:/artifacts" \ + -w /artifacts/ \ + "${MANYLINUX_IMAGE}" + ) + + # Determine python executable for given version (copied from build-triton-wheel) + case $PY_VERS in + 3.10) + PYTHON_EXECUTABLE=/opt/python/cp310-cp310/bin/python + ;; + 3.11) + PYTHON_EXECUTABLE=/opt/python/cp311-cp311/bin/python + ;; + 3.12) + PYTHON_EXECUTABLE=/opt/python/cp312-cp312/bin/python + ;; + 3.13) + PYTHON_EXECUTABLE=/opt/python/cp313-cp313/bin/python + ;; + 3.13t) + PYTHON_EXECUTABLE=/opt/python/cp313-cp313t/bin/python + ;; + 3.14) + PYTHON_EXECUTABLE=/opt/python/cp314-cp314/bin/python + ;; + 3.14t) + PYTHON_EXECUTABLE=/opt/python/cp314-cp314t/bin/python + ;; + *) + echo "Unsupported python version ${PY_VERS}" + exit 1 + ;; + esac + + docker exec -t "${container_name}" "${PYTHON_EXECUTABLE}" -mpip install \ + --pre torch torchvision torchaudio \ + --index-url "https://download.pytorch.org/whl/nightly/${BUILD_DEVICE}" + + # I wonder if there is a command to both download and install the wheels + # in one go + docker exec -t "${container_name}" "${PYTHON_EXECUTABLE}" -mpip download \ + --pre torch torchvision torchaudio \ + --index-url "https://download.pytorch.org/whl/nightly/${BUILD_DEVICE}" + + # Save this for later + echo "PYTHON_EXECUTABLE=${PYTHON_EXECUTABLE}" >> "$GITHUB_ENV" + echo "container_name=${container_name}" >> "$GITHUB_ENV" + + - name: Build vLLM wheel + uses: ./.github/actions/build-external-packages + with: + build-targets: vllm + docker-image: ${{ 
env.MANYLINUX_IMAGE }} + cuda-arch-list: '8.0;8.9;9.0;10.0;12.0' + torch-wheel-dir: ${{ runner.temp }}/artifacts + output-dir: ${{ runner.temp }}/artifacts/externals + + - name: Prepare vLLM wheel + shell: bash + run: | + set -eux + + # Get these wheels ready, the vllm renaming logic is copied from its .buildkite/scripts/upload-wheels.sh + docker exec -t "${container_name}" bash -c " + set -eux + + nightly=\$(unzip -p torch-* '**/METADATA' | grep '^Version: ' | cut -d' ' -f2 | cut -d'.' -f4) + + pushd externals/vllm/wheels + for package in xformers flashinfer-python vllm; do + pushd \$package + auditwheel repair --plat \$PLATFORM *.whl \ + --exclude libc10* --exclude libtorch* --exclude libcu* --exclude libnv* + repair_wheel=\$(find wheelhouse -name *\${PLATFORM}*) + repair_wheel=\$(basename \${repair_wheel}) + popd + + cp \${package}/wheelhouse/\${repair_wheel} . + version=\$(unzip -p \$repair_wheel '**/METADATA' | grep '^Version: ' | cut -d' ' -f2) + + if [[ \$package == vllm ]]; then + new_wheel=\${repair_wheel/\$version/1.0.0.\$nightly} + else + major_version=\$(echo \$version | tr '.+' '.' | cut -d'.' -f1-3) + new_wheel=\${repair_wheel/\$version/\$major_version.\$nightly} + fi + + mv -- \$repair_wheel \$new_wheel + rm -rf \$package + done + popd + " + + docker exec -t "${container_name}" chown -R 1000:1000 /artifacts + + - uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 + with: + name: vllm-wheel-${{ matrix.device }}-${{ matrix.python-version }}-${{ env.PLATFORM }} + if-no-files-found: error + path: ${{ runner.temp }}/artifacts/externals/vllm/wheels/*.whl + + - name: Teardown Linux + uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.9 + if: always() + + # Copied from build-triton-wheel workflow (mostly) + upload-wheel: + name: "Upload ${{ matrix.device }} vLLM wheel" + needs: + - build-wheel + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + device: [ 'cu128', 'cu129' ] + env: + BUILD_DEVICE: ${{ matrix.device }} + permissions: + id-token: write + contents: read + container: + image: continuumio/miniconda3:4.12.0 + environment: ${{ (github.event_name == 'push' && github.event.ref == 'refs/heads/main') && 'nightly-wheel-upload' || '' }} + steps: + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + + - name: Configure AWS credentials(PyTorch account) for main + if: ${{ github.event_name == 'push' && github.event.ref == 'refs/heads/main' }} + uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0 + with: + role-to-assume: arn:aws:iam::749337293305:role/gha_workflow_nightly_build_wheels + aws-region: us-east-1 + + - name: Configure AWS credentials(PyTorch account) for RC builds + if: ${{ github.event_name == 'push' && (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/')) }} + uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0 + with: + role-to-assume: arn:aws:iam::749337293305:role/gha_workflow_test_build_wheels + aws-region: us-east-1 + + - name: Download Build Artifacts + uses: actions/download-artifact@65a9edc5881444af0b9093a5e628f2fe47ea3b2e # v4.1.7 + with: + # Download all available artifacts + path: ${{ runner.temp }}/artifacts-all + + - name: Select Wheel Artifacts + shell: bash + run: | + set -eux + mkdir -p "${RUNNER_TEMP}/artifacts/" + mv "${RUNNER_TEMP}"/artifacts-all/vllm-wheel-"${BUILD_DEVICE}"-*/* "${RUNNER_TEMP}/artifacts/" + + - name: Set DRY_RUN (only for 
tagged pushes) + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) }} + shell: bash + run: | + echo "DRY_RUN=disabled" >> "$GITHUB_ENV" + + - name: Set UPLOAD_CHANNEL (only for tagged pushes) + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/v') }} + shell: bash + run: | + set -ex + + if [[ "${GITHUB_REF_NAME}" = *-rc[0-9]* ]]; then + echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" + fi + + - name: Upload binaries + env: + PACKAGE_TYPE: wheel + UPLOAD_SUBFOLDER: ${{ env.BUILD_DEVICE }} + PKG_DIR: ${{ runner.temp }}/artifacts + shell: bash + run: | + set -ex + bash .circleci/scripts/binary_upload.sh diff --git a/.github/workflows/check-labels.yml b/.github/workflows/check-labels.yml index 44430522b79d..1174a1c502f6 100644 --- a/.github/workflows/check-labels.yml +++ b/.github/workflows/check-labels.yml @@ -38,7 +38,7 @@ jobs: runs-on: linux.24_04.4x steps: - name: Checkout PyTorch - uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.9 with: submodules: false fetch-depth: 1 diff --git a/.github/workflows/close-nonexistent-disable-issues.yml b/.github/workflows/close-nonexistent-disable-issues.yml index bef3d8797149..da83019a5908 100644 --- a/.github/workflows/close-nonexistent-disable-issues.yml +++ b/.github/workflows/close-nonexistent-disable-issues.yml @@ -13,7 +13,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout PyTorch - uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.9 with: submodules: false fetch-depth: 1 diff --git a/.github/workflows/create_release.yml b/.github/workflows/create_release.yml index 57fe7be15d29..03631be3e563 100644 --- a/.github/workflows/create_release.yml +++ b/.github/workflows/create_release.yml @@ -19,7 +19,7 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.9 with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} diff --git a/.github/workflows/docker-builds.yml b/.github/workflows/docker-builds.yml index b86ee2352bd1..f88244a13ffc 100644 --- a/.github/workflows/docker-builds.yml +++ b/.github/workflows/docker-builds.yml @@ -33,7 +33,7 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.9 with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -50,28 +50,27 @@ jobs: runner: [linux.12xlarge] docker-image-name: [ pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11, + pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc11, pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc11-vllm, pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks, - pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc9-inductor-benchmarks, - pytorch-linux-jammy-cuda12.8-cudnn9-py3.13-gcc9-inductor-benchmarks, pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9, pytorch-linux-jammy-cuda12.4-cudnn9-py3-gcc11, - pytorch-linux-jammy-py3.9-clang12, + pytorch-linux-jammy-py3.10-clang12, 
pytorch-linux-jammy-py3.13-clang12, pytorch-linux-jammy-rocm-n-py3, pytorch-linux-noble-rocm-n-py3, pytorch-linux-noble-rocm-alpha-py3, pytorch-linux-jammy-rocm-n-py3-benchmarks, - pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-clang12, - pytorch-linux-jammy-py3.9-gcc11, - pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks, + pytorch-linux-jammy-cuda12.8-cudnn9-py3.10-clang12, + pytorch-linux-jammy-py3.10-gcc11, + pytorch-linux-jammy-py3-gcc11-inductor-benchmarks, pytorch-linux-jammy-py3.12-halide, - pytorch-linux-jammy-xpu-2025.0-py3, - pytorch-linux-jammy-xpu-2025.1-py3, + pytorch-linux-jammy-xpu-n-1-py3, + pytorch-linux-jammy-xpu-n-py3, pytorch-linux-jammy-py3-clang18-asan, pytorch-linux-jammy-py3-clang12-onnx, pytorch-linux-jammy-linter, - pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-linter, + pytorch-linux-jammy-cuda12.8-cudnn9-py3.10-linter, # Executorch pin needs update # pytorch-linux-jammy-py3-clang12-executorch, pytorch-linux-jammy-py3.12-triton-cpu, @@ -97,21 +96,21 @@ jobs: # [see note: pytorch repo ref] # deep clone (fetch-depth 0) required for git merge-base - name: Checkout PyTorch - uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.9 - name: Setup Linux uses: ./.github/actions/setup-linux - name: Build docker image id: build-docker-image - uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.9 with: docker-image-name: ci-image:${{ matrix.docker-image-name }} always-rebuild: true push: true - name: Pull docker image - uses: pytorch/test-infra/.github/actions/pull-docker-image@main + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.9 with: docker-image: ${{ steps.build-docker-image.outputs.docker-image }} @@ -142,5 +141,5 @@ jobs: if: always() - name: Teardown Linux - uses: pytorch/test-infra/.github/actions/teardown-linux@main + uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.9 if: always() diff --git a/.github/workflows/docker-cache-mi300.yml b/.github/workflows/docker-cache-mi300.yml index 02c1171c567a..bc2ae450f7c2 100644 --- a/.github/workflows/docker-cache-mi300.yml +++ b/.github/workflows/docker-cache-mi300.yml @@ -20,7 +20,7 @@ jobs: runs-on: rocm-docker steps: - name: Checkout PyTorch - uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.9 with: no-sudo: true @@ -39,13 +39,13 @@ jobs: - name: Calculate docker image id: calculate-docker-image - uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.9 with: docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3 push: false - name: Pull docker image - uses: pytorch/test-infra/.github/actions/pull-docker-image@main + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.9 with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} diff --git a/.github/workflows/docker-release.yml b/.github/workflows/docker-release.yml index 2560ebf7912a..134e4caf3088 100644 --- a/.github/workflows/docker-release.yml +++ b/.github/workflows/docker-release.yml @@ -37,7 +37,7 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.9 with: triggering_actor: ${{ 
github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -52,7 +52,7 @@ jobs: matrix: ${{ steps.generate-matrix.outputs.matrix }} steps: - name: Checkout PyTorch - uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.9 with: fetch-depth: 1 submodules: true @@ -82,7 +82,7 @@ jobs: CUDNN_VERSION: ${{ matrix.cudnn_version }} steps: - name: Setup SSH (Click me for login details) - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 with: github-secret: ${{ secrets.GITHUB_TOKEN }} # [see note: pytorch repo ref] @@ -164,12 +164,12 @@ jobs: fi - name: Teardown Linux - uses: pytorch/test-infra/.github/actions/teardown-linux@main + uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.9 if: always() validate: needs: build - uses: pytorch/test-infra/.github/workflows/validate-docker-images.yml@main + uses: pytorch/test-infra/.github/workflows/validate-docker-images.yml@release/2.9 with: - channel: nightly + channel: test ref: main diff --git a/.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml b/.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml index 59b14b455e9a..7e36c82644dc 100644 --- a/.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml +++ b/.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml @@ -41,7 +41,7 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.9 with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -112,7 +112,7 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - manywheel-py3_10-cuda-aarch64-12_9-build: + manywheel-py3_10-cuda-aarch64-12_6-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml needs: get-label-type @@ -121,39 +121,131 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: "12.9-aarch64" + DESIRED_CUDA: cu126 + GPU_ARCH_VERSION: "12.6-aarch64" GPU_ARCH_TYPE: cuda-aarch64 DOCKER_IMAGE: manylinuxaarch64-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 DESIRED_PYTHON: "3.10" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runs_on: linux.arm64.m7g.4xlarge.ephemeral ALPINE_IMAGE: "arm64v8/alpine" - build_name: manywheel-py3_10-cuda-aarch64-12_9 + build_name: manywheel-py3_10-cuda-aarch64-12_6 build_environment: linux-aarch64-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | 
nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' timeout-minutes: 420 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_10-cuda-aarch64-12_9-upload: # Uploading + manywheel-py3_10-cuda-aarch64-12_6-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: manywheel-py3_10-cuda-aarch64-12_9-build + needs: manywheel-py3_10-cuda-aarch64-12_6-build with: PYTORCH_ROOT: /pytorch PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: "12.9-aarch64" + DESIRED_CUDA: cu126 + GPU_ARCH_VERSION: "12.6-aarch64" GPU_ARCH_TYPE: cuda-aarch64 DOCKER_IMAGE: manylinuxaarch64-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 DESIRED_PYTHON: "3.10" - build_name: manywheel-py3_10-cuda-aarch64-12_9 + build_name: manywheel-py3_10-cuda-aarch64-12_6 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + + manywheel-py3_10-cuda-aarch64-12_8-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu128 + GPU_ARCH_VERSION: "12.8-aarch64" + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 + DESIRED_PYTHON: "3.10" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.arm64.m7g.4xlarge.ephemeral + ALPINE_IMAGE: "arm64v8/alpine" + build_name: manywheel-py3_10-cuda-aarch64-12_8 + build_environment: linux-aarch64-binary-manywheel + 
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' + timeout-minutes: 420 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_10-cuda-aarch64-12_8-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_10-cuda-aarch64-12_8-build + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu128 + GPU_ARCH_VERSION: "12.8-aarch64" + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 + DESIRED_PYTHON: "3.10" + build_name: manywheel-py3_10-cuda-aarch64-12_8 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + + manywheel-py3_10-cuda-aarch64-13_0-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0-aarch64" + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda13.0 + DESIRED_PYTHON: "3.10" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.arm64.m7g.4xlarge.ephemeral + ALPINE_IMAGE: "arm64v8/alpine" + build_name: manywheel-py3_10-cuda-aarch64-13_0 + build_environment: linux-aarch64-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' + timeout-minutes: 420 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_10-cuda-aarch64-13_0-upload: # Uploading + if: ${{ github.repository_owner == 
'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_10-cuda-aarch64-13_0-build + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0-aarch64" + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda13.0 + DESIRED_PYTHON: "3.10" + build_name: manywheel-py3_10-cuda-aarch64-13_0 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml @@ -223,7 +315,99 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - manywheel-py3_11-cuda-aarch64-12_9-build: + manywheel-py3_11-cuda-aarch64-12_6-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu126 + GPU_ARCH_VERSION: "12.6-aarch64" + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 + DESIRED_PYTHON: "3.11" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.arm64.m7g.4xlarge.ephemeral + ALPINE_IMAGE: "arm64v8/alpine" + build_name: manywheel-py3_11-cuda-aarch64-12_6 + build_environment: linux-aarch64-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' + timeout-minutes: 420 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_11-cuda-aarch64-12_6-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_11-cuda-aarch64-12_6-build + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu126 + GPU_ARCH_VERSION: "12.6-aarch64" + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 + DESIRED_PYTHON: "3.11" + build_name: manywheel-py3_11-cuda-aarch64-12_6 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + + manywheel-py3_11-cuda-aarch64-12_8-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want 
to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu128 + GPU_ARCH_VERSION: "12.8-aarch64" + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 + DESIRED_PYTHON: "3.11" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.arm64.m7g.4xlarge.ephemeral + ALPINE_IMAGE: "arm64v8/alpine" + build_name: manywheel-py3_11-cuda-aarch64-12_8 + build_environment: linux-aarch64-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' + timeout-minutes: 420 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_11-cuda-aarch64-12_8-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_11-cuda-aarch64-12_8-build + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu128 + GPU_ARCH_VERSION: "12.8-aarch64" + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 + DESIRED_PYTHON: "3.11" + build_name: manywheel-py3_11-cuda-aarch64-12_8 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + + manywheel-py3_11-cuda-aarch64-13_0-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml needs: get-label-type @@ -232,39 +416,39 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: "12.9-aarch64" + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0-aarch64" GPU_ARCH_TYPE: cuda-aarch64 DOCKER_IMAGE: manylinuxaarch64-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + DOCKER_IMAGE_TAG_PREFIX: cuda13.0 DESIRED_PYTHON: "3.11" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runs_on: linux.arm64.m7g.4xlarge.ephemeral ALPINE_IMAGE: "arm64v8/alpine" - build_name: manywheel-py3_11-cuda-aarch64-12_9 + build_name: manywheel-py3_11-cuda-aarch64-13_0 build_environment: linux-aarch64-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 
'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' timeout-minutes: 420 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_11-cuda-aarch64-12_9-upload: # Uploading + manywheel-py3_11-cuda-aarch64-13_0-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: manywheel-py3_11-cuda-aarch64-12_9-build + needs: manywheel-py3_11-cuda-aarch64-13_0-build with: PYTORCH_ROOT: /pytorch PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: "12.9-aarch64" + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0-aarch64" GPU_ARCH_TYPE: cuda-aarch64 DOCKER_IMAGE: manylinuxaarch64-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + DOCKER_IMAGE_TAG_PREFIX: cuda13.0 DESIRED_PYTHON: "3.11" - build_name: manywheel-py3_11-cuda-aarch64-12_9 + build_name: manywheel-py3_11-cuda-aarch64-13_0 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml @@ -334,7 +518,53 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - manywheel-py3_12-cuda-aarch64-12_9-build: + manywheel-py3_12-cuda-aarch64-12_6-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu126 + GPU_ARCH_VERSION: "12.6-aarch64" + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 + DESIRED_PYTHON: "3.12" + runner_prefix: "${{ 
needs.get-label-type.outputs.label-type }}" + runs_on: linux.arm64.m7g.4xlarge.ephemeral + ALPINE_IMAGE: "arm64v8/alpine" + build_name: manywheel-py3_12-cuda-aarch64-12_6 + build_environment: linux-aarch64-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' + timeout-minutes: 420 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_12-cuda-aarch64-12_6-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_12-cuda-aarch64-12_6-build + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu126 + GPU_ARCH_VERSION: "12.6-aarch64" + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 + DESIRED_PYTHON: "3.12" + build_name: manywheel-py3_12-cuda-aarch64-12_6 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + + manywheel-py3_12-cuda-aarch64-12_8-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml needs: get-label-type @@ -343,39 +573,85 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: "12.9-aarch64" + DESIRED_CUDA: cu128 + GPU_ARCH_VERSION: "12.8-aarch64" GPU_ARCH_TYPE: cuda-aarch64 DOCKER_IMAGE: manylinuxaarch64-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 DESIRED_PYTHON: "3.12" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runs_on: linux.arm64.m7g.4xlarge.ephemeral ALPINE_IMAGE: "arm64v8/alpine" - build_name: manywheel-py3_12-cuda-aarch64-12_9 + build_name: manywheel-py3_12-cuda-aarch64-12_8 build_environment: linux-aarch64-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; 
platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' timeout-minutes: 420 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_12-cuda-aarch64-12_9-upload: # Uploading + manywheel-py3_12-cuda-aarch64-12_8-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: manywheel-py3_12-cuda-aarch64-12_9-build + needs: manywheel-py3_12-cuda-aarch64-12_8-build with: PYTORCH_ROOT: /pytorch PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: "12.9-aarch64" + DESIRED_CUDA: cu128 + GPU_ARCH_VERSION: "12.8-aarch64" GPU_ARCH_TYPE: cuda-aarch64 DOCKER_IMAGE: manylinuxaarch64-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 DESIRED_PYTHON: "3.12" - build_name: manywheel-py3_12-cuda-aarch64-12_9 + build_name: manywheel-py3_12-cuda-aarch64-12_8 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + + manywheel-py3_12-cuda-aarch64-13_0-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0-aarch64" + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda13.0 + DESIRED_PYTHON: "3.12" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.arm64.m7g.4xlarge.ephemeral + ALPINE_IMAGE: "arm64v8/alpine" + build_name: manywheel-py3_12-cuda-aarch64-13_0 + build_environment: linux-aarch64-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; 
platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' + timeout-minutes: 420 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_12-cuda-aarch64-13_0-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_12-cuda-aarch64-13_0-build + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0-aarch64" + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda13.0 + DESIRED_PYTHON: "3.12" + build_name: manywheel-py3_12-cuda-aarch64-13_0 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml @@ -445,7 +721,53 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - manywheel-py3_13-cuda-aarch64-12_9-build: + manywheel-py3_13-cuda-aarch64-12_6-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu126 + GPU_ARCH_VERSION: "12.6-aarch64" + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 + DESIRED_PYTHON: "3.13" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.arm64.m7g.4xlarge.ephemeral + ALPINE_IMAGE: "arm64v8/alpine" + build_name: manywheel-py3_13-cuda-aarch64-12_6 + build_environment: linux-aarch64-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' + timeout-minutes: 420 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_13-cuda-aarch64-12_6-upload: # Uploading + if: ${{ 
github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_13-cuda-aarch64-12_6-build + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu126 + GPU_ARCH_VERSION: "12.6-aarch64" + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 + DESIRED_PYTHON: "3.13" + build_name: manywheel-py3_13-cuda-aarch64-12_6 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + + manywheel-py3_13-cuda-aarch64-12_8-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml needs: get-label-type @@ -454,39 +776,85 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: "12.9-aarch64" + DESIRED_CUDA: cu128 + GPU_ARCH_VERSION: "12.8-aarch64" GPU_ARCH_TYPE: cuda-aarch64 DOCKER_IMAGE: manylinuxaarch64-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 DESIRED_PYTHON: "3.13" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runs_on: linux.arm64.m7g.4xlarge.ephemeral ALPINE_IMAGE: "arm64v8/alpine" - build_name: manywheel-py3_13-cuda-aarch64-12_9 + build_name: manywheel-py3_13-cuda-aarch64-12_8 build_environment: linux-aarch64-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | 
nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' timeout-minutes: 420 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_13-cuda-aarch64-12_9-upload: # Uploading + manywheel-py3_13-cuda-aarch64-12_8-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: manywheel-py3_13-cuda-aarch64-12_9-build + needs: manywheel-py3_13-cuda-aarch64-12_8-build with: PYTORCH_ROOT: /pytorch PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: "12.9-aarch64" + DESIRED_CUDA: cu128 + GPU_ARCH_VERSION: "12.8-aarch64" GPU_ARCH_TYPE: cuda-aarch64 DOCKER_IMAGE: manylinuxaarch64-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 DESIRED_PYTHON: "3.13" - build_name: manywheel-py3_13-cuda-aarch64-12_9 + build_name: manywheel-py3_13-cuda-aarch64-12_8 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + + manywheel-py3_13-cuda-aarch64-13_0-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0-aarch64" + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda13.0 + DESIRED_PYTHON: "3.13" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.arm64.m7g.4xlarge.ephemeral + ALPINE_IMAGE: "arm64v8/alpine" + build_name: manywheel-py3_13-cuda-aarch64-13_0 + build_environment: linux-aarch64-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' + timeout-minutes: 420 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_13-cuda-aarch64-13_0-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_13-cuda-aarch64-13_0-build + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0-aarch64" + GPU_ARCH_TYPE: cuda-aarch64 + 
DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda13.0 + DESIRED_PYTHON: "3.13" + build_name: manywheel-py3_13-cuda-aarch64-13_0 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml @@ -556,7 +924,7 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - manywheel-py3_13t-cuda-aarch64-12_9-build: + manywheel-py3_13t-cuda-aarch64-12_6-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml needs: get-label-type @@ -565,39 +933,131 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: "12.9-aarch64" + DESIRED_CUDA: cu126 + GPU_ARCH_VERSION: "12.6-aarch64" GPU_ARCH_TYPE: cuda-aarch64 DOCKER_IMAGE: manylinuxaarch64-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 DESIRED_PYTHON: "3.13t" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runs_on: linux.arm64.m7g.4xlarge.ephemeral ALPINE_IMAGE: "arm64v8/alpine" - build_name: manywheel-py3_13t-cuda-aarch64-12_9 + build_name: manywheel-py3_13t-cuda-aarch64-12_6 build_environment: linux-aarch64-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | 
nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' timeout-minutes: 420 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_13t-cuda-aarch64-12_9-upload: # Uploading + manywheel-py3_13t-cuda-aarch64-12_6-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: manywheel-py3_13t-cuda-aarch64-12_9-build + needs: manywheel-py3_13t-cuda-aarch64-12_6-build with: PYTORCH_ROOT: /pytorch PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: "12.9-aarch64" + DESIRED_CUDA: cu126 + GPU_ARCH_VERSION: "12.6-aarch64" GPU_ARCH_TYPE: cuda-aarch64 DOCKER_IMAGE: manylinuxaarch64-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 DESIRED_PYTHON: "3.13t" - build_name: manywheel-py3_13t-cuda-aarch64-12_9 + build_name: manywheel-py3_13t-cuda-aarch64-12_6 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + + manywheel-py3_13t-cuda-aarch64-12_8-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu128 + GPU_ARCH_VERSION: "12.8-aarch64" + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 + DESIRED_PYTHON: "3.13t" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.arm64.m7g.4xlarge.ephemeral + ALPINE_IMAGE: "arm64v8/alpine" + build_name: manywheel-py3_13t-cuda-aarch64-12_8 + build_environment: linux-aarch64-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' + timeout-minutes: 420 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_13t-cuda-aarch64-12_8-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_13t-cuda-aarch64-12_8-build + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu128 + GPU_ARCH_VERSION: "12.8-aarch64" + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 + DESIRED_PYTHON: "3.13t" + build_name: manywheel-py3_13t-cuda-aarch64-12_8 + secrets: + 
github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + + manywheel-py3_13t-cuda-aarch64-13_0-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0-aarch64" + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda13.0 + DESIRED_PYTHON: "3.13t" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.arm64.m7g.4xlarge.ephemeral + ALPINE_IMAGE: "arm64v8/alpine" + build_name: manywheel-py3_13t-cuda-aarch64-13_0 + build_environment: linux-aarch64-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' + timeout-minutes: 420 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_13t-cuda-aarch64-13_0-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_13t-cuda-aarch64-13_0-build + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0-aarch64" + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda13.0 + DESIRED_PYTHON: "3.13t" + build_name: manywheel-py3_13t-cuda-aarch64-13_0 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml @@ -667,7 +1127,99 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - manywheel-py3_14-cuda-aarch64-12_9-build: + manywheel-py3_14-cuda-aarch64-12_6-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu126 + GPU_ARCH_VERSION: "12.6-aarch64" + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 + DESIRED_PYTHON: "3.14" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.arm64.m7g.4xlarge.ephemeral + ALPINE_IMAGE: "arm64v8/alpine" + build_name: manywheel-py3_14-cuda-aarch64-12_6 + build_environment: linux-aarch64-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 
'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' + timeout-minutes: 420 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_14-cuda-aarch64-12_6-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_14-cuda-aarch64-12_6-build + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu126 + GPU_ARCH_VERSION: "12.6-aarch64" + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 + DESIRED_PYTHON: "3.14" + build_name: manywheel-py3_14-cuda-aarch64-12_6 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + + manywheel-py3_14-cuda-aarch64-12_8-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu128 + GPU_ARCH_VERSION: "12.8-aarch64" + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 + DESIRED_PYTHON: "3.14" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.arm64.m7g.4xlarge.ephemeral + ALPINE_IMAGE: "arm64v8/alpine" + build_name: manywheel-py3_14-cuda-aarch64-12_8 + build_environment: linux-aarch64-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' + timeout-minutes: 420 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_14-cuda-aarch64-12_8-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: 
write + contents: read + needs: manywheel-py3_14-cuda-aarch64-12_8-build + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu128 + GPU_ARCH_VERSION: "12.8-aarch64" + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 + DESIRED_PYTHON: "3.14" + build_name: manywheel-py3_14-cuda-aarch64-12_8 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + + manywheel-py3_14-cuda-aarch64-13_0-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml needs: get-label-type @@ -676,39 +1228,39 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: "12.9-aarch64" + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0-aarch64" GPU_ARCH_TYPE: cuda-aarch64 DOCKER_IMAGE: manylinuxaarch64-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + DOCKER_IMAGE_TAG_PREFIX: cuda13.0 DESIRED_PYTHON: "3.14" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runs_on: linux.arm64.m7g.4xlarge.ephemeral ALPINE_IMAGE: "arm64v8/alpine" - build_name: manywheel-py3_14-cuda-aarch64-12_9 + build_name: manywheel-py3_14-cuda-aarch64-13_0 build_environment: linux-aarch64-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' 
| nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' timeout-minutes: 420 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_14-cuda-aarch64-12_9-upload: # Uploading + manywheel-py3_14-cuda-aarch64-13_0-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: manywheel-py3_14-cuda-aarch64-12_9-build + needs: manywheel-py3_14-cuda-aarch64-13_0-build with: PYTORCH_ROOT: /pytorch PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: "12.9-aarch64" + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0-aarch64" GPU_ARCH_TYPE: cuda-aarch64 DOCKER_IMAGE: manylinuxaarch64-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + DOCKER_IMAGE_TAG_PREFIX: cuda13.0 DESIRED_PYTHON: "3.14" - build_name: manywheel-py3_14-cuda-aarch64-12_9 + build_name: manywheel-py3_14-cuda-aarch64-13_0 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml @@ -778,7 +1330,99 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - manywheel-py3_14t-cuda-aarch64-12_9-build: + manywheel-py3_14t-cuda-aarch64-12_6-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu126 + GPU_ARCH_VERSION: "12.6-aarch64" + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 + DESIRED_PYTHON: "3.14t" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.arm64.m7g.4xlarge.ephemeral + ALPINE_IMAGE: "arm64v8/alpine" + build_name: manywheel-py3_14t-cuda-aarch64-12_6 + build_environment: linux-aarch64-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' + timeout-minutes: 420 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_14t-cuda-aarch64-12_6-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_14t-cuda-aarch64-12_6-build + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + 
DESIRED_CUDA: cu126 + GPU_ARCH_VERSION: "12.6-aarch64" + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.6 + DESIRED_PYTHON: "3.14t" + build_name: manywheel-py3_14t-cuda-aarch64-12_6 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + + manywheel-py3_14t-cuda-aarch64-12_8-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu128 + GPU_ARCH_VERSION: "12.8-aarch64" + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 + DESIRED_PYTHON: "3.14t" + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + runs_on: linux.arm64.m7g.4xlarge.ephemeral + ALPINE_IMAGE: "arm64v8/alpine" + build_name: manywheel-py3_14t-cuda-aarch64-12_8 + build_environment: linux-aarch64-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' + timeout-minutes: 420 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_14t-cuda-aarch64-12_8-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_14t-cuda-aarch64-12_8-build + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu128 + GPU_ARCH_VERSION: "12.8-aarch64" + GPU_ARCH_TYPE: cuda-aarch64 + DOCKER_IMAGE: manylinuxaarch64-builder + DOCKER_IMAGE_TAG_PREFIX: cuda12.8 + DESIRED_PYTHON: "3.14t" + build_name: manywheel-py3_14t-cuda-aarch64-12_8 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + + manywheel-py3_14t-cuda-aarch64-13_0-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml needs: get-label-type @@ -787,39 +1431,39 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: "12.9-aarch64" + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0-aarch64" GPU_ARCH_TYPE: cuda-aarch64 DOCKER_IMAGE: manylinuxaarch64-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + DOCKER_IMAGE_TAG_PREFIX: cuda13.0 DESIRED_PYTHON: "3.14t" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runs_on: linux.arm64.m7g.4xlarge.ephemeral 
ALPINE_IMAGE: "arm64v8/alpine" - build_name: manywheel-py3_14t-cuda-aarch64-12_9 + build_name: manywheel-py3_14t-cuda-aarch64-13_0 build_environment: linux-aarch64-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' timeout-minutes: 420 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_14t-cuda-aarch64-12_9-upload: # Uploading + manywheel-py3_14t-cuda-aarch64-13_0-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: manywheel-py3_14t-cuda-aarch64-12_9-build + needs: manywheel-py3_14t-cuda-aarch64-13_0-build with: PYTORCH_ROOT: /pytorch PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: "12.9-aarch64" + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0-aarch64" GPU_ARCH_TYPE: cuda-aarch64 DOCKER_IMAGE: manylinuxaarch64-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + DOCKER_IMAGE_TAG_PREFIX: cuda13.0 DESIRED_PYTHON: "3.14t" - build_name: manywheel-py3_14t-cuda-aarch64-12_9 + build_name: manywheel-py3_14t-cuda-aarch64-13_0 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml diff --git 
a/.github/workflows/generated-linux-binary-libtorch-nightly.yml b/.github/workflows/generated-linux-binary-libtorch-nightly.yml index 776e77e80826..bc671ae80ae2 100644 --- a/.github/workflows/generated-linux-binary-libtorch-nightly.yml +++ b/.github/workflows/generated-linux-binary-libtorch-nightly.yml @@ -41,7 +41,7 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.9 with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -248,7 +248,7 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - libtorch-cuda12_9-shared-with-deps-release-build: + libtorch-cuda13_0-shared-with-deps-release-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml needs: get-label-type @@ -257,22 +257,22 @@ jobs: PACKAGE_TYPE: libtorch # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: "12.9" + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: libtorch-cxx11-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + DOCKER_IMAGE_TAG_PREFIX: cuda13.0 LIBTORCH_CONFIG: release LIBTORCH_VARIANT: shared-with-deps runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build_name: libtorch-cuda12_9-shared-with-deps-release + build_name: libtorch-cuda13_0-shared-with-deps-release build_environment: linux-binary-libtorch secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - libtorch-cuda12_9-shared-with-deps-release-test: # Testing + libtorch-cuda13_0-shared-with-deps-release-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - libtorch-cuda12_9-shared-with-deps-release-build + - libtorch-cuda13_0-shared-with-deps-release-build - get-label-type uses: ./.github/workflows/_binary-test-linux.yml with: @@ -280,38 +280,38 @@ jobs: PACKAGE_TYPE: libtorch # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: "12.9" + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: libtorch-cxx11-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + DOCKER_IMAGE_TAG_PREFIX: cuda13.0 LIBTORCH_CONFIG: release LIBTORCH_VARIANT: shared-with-deps - build_name: libtorch-cuda12_9-shared-with-deps-release + build_name: libtorch-cuda13_0-shared-with-deps-release build_environment: linux-binary-libtorch runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - libtorch-cuda12_9-shared-with-deps-release-upload: # Uploading + libtorch-cuda13_0-shared-with-deps-release-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: libtorch-cuda12_9-shared-with-deps-release-test + needs: libtorch-cuda13_0-shared-with-deps-release-test with: PYTORCH_ROOT: /pytorch PACKAGE_TYPE: libtorch # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: "12.9" + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda DOCKER_IMAGE: libtorch-cxx11-builder - 
DOCKER_IMAGE_TAG_PREFIX: cuda12.9 + DOCKER_IMAGE_TAG_PREFIX: cuda13.0 LIBTORCH_CONFIG: release LIBTORCH_VARIANT: shared-with-deps - build_name: libtorch-cuda12_9-shared-with-deps-release + build_name: libtorch-cuda13_0-shared-with-deps-release secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml @@ -333,6 +333,7 @@ jobs: LIBTORCH_CONFIG: release LIBTORCH_VARIANT: shared-with-deps runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + timeout-minutes: 300 build_name: libtorch-rocm6_3-shared-with-deps-release build_environment: linux-binary-libtorch secrets: @@ -368,7 +369,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -390,7 +390,7 @@ jobs: role-duration-seconds: 18000 - name: Calculate docker image id: calculate-docker-image - uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.9 with: docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: libtorch-cxx11-builder @@ -398,7 +398,7 @@ jobs: docker-build-dir: .ci/docker working-directory: pytorch - name: Pull Docker image - uses: pytorch/test-infra/.github/actions/pull-docker-image@main + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.9 with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Test Pytorch binary @@ -447,6 +447,7 @@ jobs: LIBTORCH_CONFIG: release LIBTORCH_VARIANT: shared-with-deps runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + timeout-minutes: 300 build_name: libtorch-rocm6_4-shared-with-deps-release build_environment: linux-binary-libtorch secrets: @@ -482,7 +483,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -504,7 +504,7 @@ jobs: role-duration-seconds: 18000 - name: Calculate docker image id: calculate-docker-image - uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.9 with: docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: libtorch-cxx11-builder @@ -512,7 +512,7 @@ jobs: docker-build-dir: .ci/docker working-directory: pytorch - name: Pull Docker image - uses: pytorch/test-infra/.github/actions/pull-docker-image@main + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.9 with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Test Pytorch binary diff --git a/.github/workflows/generated-linux-binary-libtorch-release-main.yml b/.github/workflows/generated-linux-binary-libtorch-release-main.yml index c98d71dfefc4..9d55fc6e50ab 100644 --- a/.github/workflows/generated-linux-binary-libtorch-release-main.yml +++ b/.github/workflows/generated-linux-binary-libtorch-release-main.yml @@ -36,7 +36,7 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: 
pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.9 with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} diff --git a/.github/workflows/generated-linux-binary-manywheel-main.yml b/.github/workflows/generated-linux-binary-manywheel-main.yml index ec08b2c78eb6..85b91378b253 100644 --- a/.github/workflows/generated-linux-binary-manywheel-main.yml +++ b/.github/workflows/generated-linux-binary-manywheel-main.yml @@ -36,7 +36,7 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.9 with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -60,7 +60,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_12-cuda12_8 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_12-cuda12_8-test: # Testing diff 
--git a/.github/workflows/generated-linux-binary-manywheel-nightly.yml b/.github/workflows/generated-linux-binary-manywheel-nightly.yml index 96a4a0fff837..5f9eaab976a6 100644 --- a/.github/workflows/generated-linux-binary-manywheel-nightly.yml +++ b/.github/workflows/generated-linux-binary-manywheel-nightly.yml @@ -41,7 +41,7 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.9 with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -127,7 +127,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_10-cuda12_6 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_10-cuda12_6-test: # Testing @@ -193,7 +193,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_10-cuda12_8 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: 
nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_10-cuda12_8-test: # Testing @@ -241,72 +241,6 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - manywheel-py3_10-cuda12_9-build: - if: ${{ github.repository_owner == 'pytorch' }} - uses: ./.github/workflows/_binary-build-linux.yml - needs: get-label-type - with: - PYTORCH_ROOT: /pytorch - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: "12.9" - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 - DESIRED_PYTHON: "3.10" - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build_name: manywheel-py3_10-cuda12_9 - build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 
'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64' - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_10-cuda12_9-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: - - manywheel-py3_10-cuda12_9-build - - get-label-type - uses: ./.github/workflows/_binary-test-linux.yml - with: - PYTORCH_ROOT: /pytorch - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: "12.9" - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 - DESIRED_PYTHON: "3.10" - build_name: manywheel-py3_10-cuda12_9 - build_environment: linux-binary-manywheel - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_10-cuda12_9-upload: # Uploading - if: ${{ github.repository_owner == 'pytorch' }} - permissions: - id-token: write - contents: read - needs: manywheel-py3_10-cuda12_9-test - with: - PYTORCH_ROOT: /pytorch - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: "12.9" - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 - DESIRED_PYTHON: "3.10" - build_name: manywheel-py3_10-cuda12_9 - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - uses: ./.github/workflows/_binary-upload.yml - manywheel-py3_10-cuda13_0-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -325,7 +259,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_10-cuda13_0 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.12.0.46; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and 
platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_10-cuda13_0-test: # Testing @@ -389,6 +323,7 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: rocm6.3 DESIRED_PYTHON: "3.10" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + timeout-minutes: 300 build_name: manywheel-py3_10-rocm6_3 build_environment: linux-binary-manywheel secrets: @@ -423,7 +358,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -445,7 +379,7 @@ jobs: role-duration-seconds: 18000 - name: Calculate docker image id: calculate-docker-image - uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.9 with: docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: manylinux2_28-builder @@ -453,7 +387,7 @@ jobs: docker-build-dir: .ci/docker working-directory: pytorch - name: Pull Docker image - uses: pytorch/test-infra/.github/actions/pull-docker-image@main + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.9 with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Test Pytorch binary @@ -500,6 +434,7 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: rocm6.4 DESIRED_PYTHON: "3.10" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + timeout-minutes: 300 build_name: manywheel-py3_10-rocm6_4 build_environment: linux-binary-manywheel secrets: @@ -534,7 +469,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -556,7 +490,7 @@ jobs: 
role-duration-seconds: 18000 - name: Calculate docker image id: calculate-docker-image - uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.9 with: docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: manylinux2_28-builder @@ -564,7 +498,7 @@ jobs: docker-build-dir: .ci/docker working-directory: pytorch - name: Pull Docker image - uses: pytorch/test-infra/.github/actions/pull-docker-image@main + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.9 with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Test Pytorch binary @@ -612,7 +546,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_10-xpu build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.1.1 | intel-cmplr-lib-ur==2025.1.1 | intel-cmplr-lic-rt==2025.1.1 | intel-sycl-rt==2025.1.1 | oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.1.0 | onemkl-sycl-dft==2025.1.0 | onemkl-sycl-lapack==2025.1.0 | onemkl-sycl-rng==2025.1.0 | onemkl-sycl-sparse==2025.1.0 | dpcpp-cpp-rt==2025.1.1 | intel-opencl-rt==2025.1.1 | mkl==2025.1.0 | intel-openmp==2025.1.1 | tbb==2022.1.0 | tcmlib==1.3.0 | umf==0.10.0 | intel-pti==0.12.3 + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.2.1 | intel-cmplr-lib-ur==2025.2.1 | intel-cmplr-lic-rt==2025.2.1 | intel-sycl-rt==2025.2.1 | oneccl-devel==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.2.0 | onemkl-sycl-dft==2025.2.0 | onemkl-sycl-lapack==2025.2.0 | onemkl-sycl-rng==2025.2.0 | onemkl-sycl-sparse==2025.2.0 | dpcpp-cpp-rt==2025.2.1 | intel-opencl-rt==2025.2.1 | mkl==2025.2.0 | intel-openmp==2025.2.1 | tbb==2022.2.0 | tcmlib==1.4.0 | umf==0.11.0 | intel-pti==0.13.1 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_10-xpu-test: # Testing @@ -638,7 +572,7 @@ jobs: contents: read steps: - name: Setup XPU - uses: ./.github/actions/setup-xpu + uses: pytorch/pytorch/.github/actions/setup-xpu@release/2.9 - name: configure aws credentials id: aws_creds uses: aws-actions/configure-aws-credentials@v4 @@ -656,7 +590,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -667,7 +600,7 @@ jobs: working-directory: pytorch - name: Calculate docker image id: calculate-docker-image - uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.9 with: docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: manylinux2_28-builder @@ -675,7 +608,7 @@ jobs: docker-build-dir: .ci/docker working-directory: pytorch - name: Pull Docker image - uses: 
pytorch/test-infra/.github/actions/pull-docker-image@main + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.9 with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Test Pytorch binary @@ -785,7 +718,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_11-cuda12_6 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_11-cuda12_6-test: # Testing @@ -851,7 +784,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_11-cuda12_8 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | 
nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_11-cuda12_8-test: # Testing @@ -899,72 +832,6 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - manywheel-py3_11-cuda12_9-build: - if: ${{ github.repository_owner == 'pytorch' }} - uses: ./.github/workflows/_binary-build-linux.yml - needs: get-label-type - with: - PYTORCH_ROOT: /pytorch - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: "12.9" - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 - DESIRED_PYTHON: "3.11" - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build_name: manywheel-py3_11-cuda12_9 - build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 
'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64' - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_11-cuda12_9-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: - - manywheel-py3_11-cuda12_9-build - - get-label-type - uses: ./.github/workflows/_binary-test-linux.yml - with: - PYTORCH_ROOT: /pytorch - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: "12.9" - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 - DESIRED_PYTHON: "3.11" - build_name: manywheel-py3_11-cuda12_9 - build_environment: linux-binary-manywheel - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_11-cuda12_9-upload: # Uploading - if: ${{ github.repository_owner == 'pytorch' }} - permissions: - id-token: write - contents: read - needs: manywheel-py3_11-cuda12_9-test - with: - PYTORCH_ROOT: /pytorch - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: "12.9" - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 - DESIRED_PYTHON: "3.11" - build_name: manywheel-py3_11-cuda12_9 - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - uses: ./.github/workflows/_binary-upload.yml - manywheel-py3_11-cuda13_0-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -983,7 +850,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_11-cuda13_0 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.12.0.46; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | 
nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_11-cuda13_0-test: # Testing @@ -1047,6 +914,7 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: rocm6.3 DESIRED_PYTHON: "3.11" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + timeout-minutes: 300 build_name: manywheel-py3_11-rocm6_3 build_environment: linux-binary-manywheel secrets: @@ -1081,7 +949,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -1103,7 +970,7 @@ jobs: role-duration-seconds: 18000 - name: Calculate docker image id: calculate-docker-image - uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.9 with: docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: manylinux2_28-builder @@ -1111,7 +978,7 @@ jobs: docker-build-dir: .ci/docker working-directory: pytorch - name: Pull Docker image - uses: pytorch/test-infra/.github/actions/pull-docker-image@main + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.9 with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Test Pytorch binary @@ -1158,6 +1025,7 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: rocm6.4 DESIRED_PYTHON: "3.11" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + timeout-minutes: 300 build_name: manywheel-py3_11-rocm6_4 build_environment: linux-binary-manywheel secrets: @@ -1192,7 +1060,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -1214,7 +1081,7 @@ jobs: role-duration-seconds: 18000 - name: Calculate docker image id: calculate-docker-image - uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.9 with: docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: manylinux2_28-builder @@ -1222,7 +1089,7 @@ jobs: docker-build-dir: .ci/docker 
working-directory: pytorch - name: Pull Docker image - uses: pytorch/test-infra/.github/actions/pull-docker-image@main + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.9 with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Test Pytorch binary @@ -1270,7 +1137,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_11-xpu build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.1.1 | intel-cmplr-lib-ur==2025.1.1 | intel-cmplr-lic-rt==2025.1.1 | intel-sycl-rt==2025.1.1 | oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.1.0 | onemkl-sycl-dft==2025.1.0 | onemkl-sycl-lapack==2025.1.0 | onemkl-sycl-rng==2025.1.0 | onemkl-sycl-sparse==2025.1.0 | dpcpp-cpp-rt==2025.1.1 | intel-opencl-rt==2025.1.1 | mkl==2025.1.0 | intel-openmp==2025.1.1 | tbb==2022.1.0 | tcmlib==1.3.0 | umf==0.10.0 | intel-pti==0.12.3 + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.2.1 | intel-cmplr-lib-ur==2025.2.1 | intel-cmplr-lic-rt==2025.2.1 | intel-sycl-rt==2025.2.1 | oneccl-devel==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.2.0 | onemkl-sycl-dft==2025.2.0 | onemkl-sycl-lapack==2025.2.0 | onemkl-sycl-rng==2025.2.0 | onemkl-sycl-sparse==2025.2.0 | dpcpp-cpp-rt==2025.2.1 | intel-opencl-rt==2025.2.1 | mkl==2025.2.0 | intel-openmp==2025.2.1 | tbb==2022.2.0 | tcmlib==1.4.0 | umf==0.11.0 | intel-pti==0.13.1 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_11-xpu-test: # Testing @@ -1296,7 +1163,7 @@ jobs: contents: read steps: - name: Setup XPU - uses: ./.github/actions/setup-xpu + uses: pytorch/pytorch/.github/actions/setup-xpu@release/2.9 - name: configure aws credentials id: aws_creds uses: aws-actions/configure-aws-credentials@v4 @@ -1314,7 +1181,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -1325,7 +1191,7 @@ jobs: working-directory: pytorch - name: Calculate docker image id: calculate-docker-image - uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.9 with: docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: manylinux2_28-builder @@ -1333,7 +1199,7 @@ jobs: docker-build-dir: .ci/docker working-directory: pytorch - name: Pull Docker image - uses: pytorch/test-infra/.github/actions/pull-docker-image@main + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.9 with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Test Pytorch binary @@ -1443,7 +1309,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_12-cuda12_6 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' 
and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_12-cuda12_6-test: # Testing @@ -1509,7 +1375,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_12-cuda12_8 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' 
and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_12-cuda12_8-test: # Testing @@ -1557,72 +1423,6 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - manywheel-py3_12-cuda12_9-build: - if: ${{ github.repository_owner == 'pytorch' }} - uses: ./.github/workflows/_binary-build-linux.yml - needs: get-label-type - with: - PYTORCH_ROOT: /pytorch - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: "12.9" - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 - DESIRED_PYTHON: "3.12" - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build_name: manywheel-py3_12-cuda12_9 - build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | 
nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64' - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_12-cuda12_9-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: - - manywheel-py3_12-cuda12_9-build - - get-label-type - uses: ./.github/workflows/_binary-test-linux.yml - with: - PYTORCH_ROOT: /pytorch - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: "12.9" - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 - DESIRED_PYTHON: "3.12" - build_name: manywheel-py3_12-cuda12_9 - build_environment: linux-binary-manywheel - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_12-cuda12_9-upload: # Uploading - if: ${{ github.repository_owner == 'pytorch' }} - permissions: - id-token: write - contents: read - needs: manywheel-py3_12-cuda12_9-test - with: - PYTORCH_ROOT: /pytorch - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: "12.9" - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 - DESIRED_PYTHON: "3.12" - build_name: manywheel-py3_12-cuda12_9 - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - uses: ./.github/workflows/_binary-upload.yml - manywheel-py3_12-cuda13_0-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -1641,7 +1441,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_12-cuda13_0 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.12.0.46; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | 
nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_12-cuda13_0-test: # Testing @@ -1705,6 +1505,7 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: rocm6.3 DESIRED_PYTHON: "3.12" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + timeout-minutes: 300 build_name: manywheel-py3_12-rocm6_3 build_environment: linux-binary-manywheel secrets: @@ -1739,7 +1540,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -1761,7 +1561,7 @@ jobs: role-duration-seconds: 18000 - name: Calculate docker image id: calculate-docker-image - uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.9 with: docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: manylinux2_28-builder @@ -1769,7 +1569,7 @@ jobs: docker-build-dir: .ci/docker working-directory: pytorch - name: Pull Docker image - uses: pytorch/test-infra/.github/actions/pull-docker-image@main + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.9 with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Test Pytorch binary @@ -1816,6 +1616,7 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: rocm6.4 DESIRED_PYTHON: "3.12" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + timeout-minutes: 300 build_name: manywheel-py3_12-rocm6_4 build_environment: linux-binary-manywheel secrets: @@ -1850,7 +1651,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -1872,7 +1672,7 @@ jobs: role-duration-seconds: 18000 - name: Calculate docker image id: calculate-docker-image - uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.9 with: docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: manylinux2_28-builder @@ -1880,7 +1680,7 @@ jobs: docker-build-dir: .ci/docker working-directory: pytorch - name: Pull Docker image - uses: pytorch/test-infra/.github/actions/pull-docker-image@main + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.9 with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Test Pytorch binary @@ -1928,7 +1728,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_12-xpu build_environment: linux-binary-manywheel - 
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.1.1 | intel-cmplr-lib-ur==2025.1.1 | intel-cmplr-lic-rt==2025.1.1 | intel-sycl-rt==2025.1.1 | oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.1.0 | onemkl-sycl-dft==2025.1.0 | onemkl-sycl-lapack==2025.1.0 | onemkl-sycl-rng==2025.1.0 | onemkl-sycl-sparse==2025.1.0 | dpcpp-cpp-rt==2025.1.1 | intel-opencl-rt==2025.1.1 | mkl==2025.1.0 | intel-openmp==2025.1.1 | tbb==2022.1.0 | tcmlib==1.3.0 | umf==0.10.0 | intel-pti==0.12.3 + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.2.1 | intel-cmplr-lib-ur==2025.2.1 | intel-cmplr-lic-rt==2025.2.1 | intel-sycl-rt==2025.2.1 | oneccl-devel==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.2.0 | onemkl-sycl-dft==2025.2.0 | onemkl-sycl-lapack==2025.2.0 | onemkl-sycl-rng==2025.2.0 | onemkl-sycl-sparse==2025.2.0 | dpcpp-cpp-rt==2025.2.1 | intel-opencl-rt==2025.2.1 | mkl==2025.2.0 | intel-openmp==2025.2.1 | tbb==2022.2.0 | tcmlib==1.4.0 | umf==0.11.0 | intel-pti==0.13.1 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_12-xpu-test: # Testing @@ -1954,7 +1754,7 @@ jobs: contents: read steps: - name: Setup XPU - uses: ./.github/actions/setup-xpu + uses: pytorch/pytorch/.github/actions/setup-xpu@release/2.9 - name: configure aws credentials id: aws_creds uses: aws-actions/configure-aws-credentials@v4 @@ -1972,7 +1772,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -1983,7 +1782,7 @@ jobs: working-directory: pytorch - name: Calculate docker image id: calculate-docker-image - uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.9 with: docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: manylinux2_28-builder @@ -1991,7 +1790,7 @@ jobs: docker-build-dir: .ci/docker working-directory: pytorch - name: Pull Docker image - uses: pytorch/test-infra/.github/actions/pull-docker-image@main + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.9 with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Test Pytorch binary @@ -2101,7 +1900,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_13-cuda12_6 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' 
and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_13-cuda12_6-test: # Testing @@ -2167,7 +1966,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_13-cuda12_8 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: 
nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_13-cuda12_8-test: # Testing @@ -2215,72 +2014,6 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - manywheel-py3_13-cuda12_9-build: - if: ${{ github.repository_owner == 'pytorch' }} - uses: ./.github/workflows/_binary-build-linux.yml - needs: get-label-type - with: - PYTORCH_ROOT: /pytorch - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: "12.9" - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 - DESIRED_PYTHON: "3.13" - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build_name: manywheel-py3_13-cuda12_9 - build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64' - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_13-cuda12_9-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: - - manywheel-py3_13-cuda12_9-build - - get-label-type - uses: ./.github/workflows/_binary-test-linux.yml - with: - PYTORCH_ROOT: /pytorch - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we 
eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: "12.9" - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 - DESIRED_PYTHON: "3.13" - build_name: manywheel-py3_13-cuda12_9 - build_environment: linux-binary-manywheel - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_13-cuda12_9-upload: # Uploading - if: ${{ github.repository_owner == 'pytorch' }} - permissions: - id-token: write - contents: read - needs: manywheel-py3_13-cuda12_9-test - with: - PYTORCH_ROOT: /pytorch - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: "12.9" - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 - DESIRED_PYTHON: "3.13" - build_name: manywheel-py3_13-cuda12_9 - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - uses: ./.github/workflows/_binary-upload.yml - manywheel-py3_13-cuda13_0-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -2299,7 +2032,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_13-cuda13_0 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.12.0.46; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; 
platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_13-cuda13_0-test: # Testing @@ -2363,6 +2096,7 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: rocm6.3 DESIRED_PYTHON: "3.13" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + timeout-minutes: 300 build_name: manywheel-py3_13-rocm6_3 build_environment: linux-binary-manywheel secrets: @@ -2397,7 +2131,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -2419,7 +2152,7 @@ jobs: role-duration-seconds: 18000 - name: Calculate docker image id: calculate-docker-image - uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.9 with: docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: manylinux2_28-builder @@ -2427,7 +2160,7 @@ jobs: docker-build-dir: .ci/docker working-directory: pytorch - name: Pull Docker image - uses: pytorch/test-infra/.github/actions/pull-docker-image@main + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.9 with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Test Pytorch binary @@ -2474,6 +2207,7 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: rocm6.4 DESIRED_PYTHON: "3.13" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + timeout-minutes: 300 build_name: manywheel-py3_13-rocm6_4 build_environment: linux-binary-manywheel secrets: @@ -2508,7 +2242,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -2530,7 +2263,7 @@ jobs: role-duration-seconds: 18000 - name: Calculate docker image id: calculate-docker-image - uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.9 with: docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: manylinux2_28-builder @@ -2538,7 +2271,7 @@ jobs: docker-build-dir: .ci/docker working-directory: pytorch - name: Pull Docker image - uses: pytorch/test-infra/.github/actions/pull-docker-image@main + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.9 with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Test Pytorch binary @@ -2586,7 +2319,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_13-xpu build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.1.1 | intel-cmplr-lib-ur==2025.1.1 | intel-cmplr-lic-rt==2025.1.1 | intel-sycl-rt==2025.1.1 | oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.1.0 | onemkl-sycl-dft==2025.1.0 | 
onemkl-sycl-lapack==2025.1.0 | onemkl-sycl-rng==2025.1.0 | onemkl-sycl-sparse==2025.1.0 | dpcpp-cpp-rt==2025.1.1 | intel-opencl-rt==2025.1.1 | mkl==2025.1.0 | intel-openmp==2025.1.1 | tbb==2022.1.0 | tcmlib==1.3.0 | umf==0.10.0 | intel-pti==0.12.3 + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.2.1 | intel-cmplr-lib-ur==2025.2.1 | intel-cmplr-lic-rt==2025.2.1 | intel-sycl-rt==2025.2.1 | oneccl-devel==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.2.0 | onemkl-sycl-dft==2025.2.0 | onemkl-sycl-lapack==2025.2.0 | onemkl-sycl-rng==2025.2.0 | onemkl-sycl-sparse==2025.2.0 | dpcpp-cpp-rt==2025.2.1 | intel-opencl-rt==2025.2.1 | mkl==2025.2.0 | intel-openmp==2025.2.1 | tbb==2022.2.0 | tcmlib==1.4.0 | umf==0.11.0 | intel-pti==0.13.1 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_13-xpu-test: # Testing @@ -2612,7 +2345,7 @@ jobs: contents: read steps: - name: Setup XPU - uses: ./.github/actions/setup-xpu + uses: pytorch/pytorch/.github/actions/setup-xpu@release/2.9 - name: configure aws credentials id: aws_creds uses: aws-actions/configure-aws-credentials@v4 @@ -2630,7 +2363,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -2641,7 +2373,7 @@ jobs: working-directory: pytorch - name: Calculate docker image id: calculate-docker-image - uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.9 with: docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: manylinux2_28-builder @@ -2649,7 +2381,7 @@ jobs: docker-build-dir: .ci/docker working-directory: pytorch - name: Pull Docker image - uses: pytorch/test-infra/.github/actions/pull-docker-image@main + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.9 with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Test Pytorch binary @@ -2759,7 +2491,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_13t-cuda12_6 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' 
and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_13t-cuda12_6-test: # Testing @@ -2825,7 +2557,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_13t-cuda12_8 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | 
nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_13t-cuda12_8-test: # Testing @@ -2873,72 +2605,6 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - manywheel-py3_13t-cuda12_9-build: - if: ${{ github.repository_owner == 'pytorch' }} - uses: ./.github/workflows/_binary-build-linux.yml - needs: get-label-type - with: - PYTORCH_ROOT: /pytorch - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: "12.9" - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 - DESIRED_PYTHON: "3.13t" - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build_name: manywheel-py3_13t-cuda12_9 - build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64' - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_13t-cuda12_9-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: - - manywheel-py3_13t-cuda12_9-build - - get-label-type - uses: ./.github/workflows/_binary-test-linux.yml - with: - PYTORCH_ROOT: /pytorch - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: "12.9" - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 - DESIRED_PYTHON: "3.13t" - build_name: manywheel-py3_13t-cuda12_9 - build_environment: linux-binary-manywheel - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner - secrets: - github-token: 
${{ secrets.GITHUB_TOKEN }} - manywheel-py3_13t-cuda12_9-upload: # Uploading - if: ${{ github.repository_owner == 'pytorch' }} - permissions: - id-token: write - contents: read - needs: manywheel-py3_13t-cuda12_9-test - with: - PYTORCH_ROOT: /pytorch - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: "12.9" - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 - DESIRED_PYTHON: "3.13t" - build_name: manywheel-py3_13t-cuda12_9 - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - uses: ./.github/workflows/_binary-upload.yml - manywheel-py3_13t-cuda13_0-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -2957,7 +2623,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_13t-cuda13_0 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.12.0.46; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_13t-cuda13_0-test: # Testing @@ -3021,6 +2687,7 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: rocm6.3 DESIRED_PYTHON: "3.13t" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + timeout-minutes: 300 build_name: 
manywheel-py3_13t-rocm6_3 build_environment: linux-binary-manywheel secrets: @@ -3055,7 +2722,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -3077,7 +2743,7 @@ jobs: role-duration-seconds: 18000 - name: Calculate docker image id: calculate-docker-image - uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.9 with: docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: manylinux2_28-builder @@ -3085,7 +2751,7 @@ jobs: docker-build-dir: .ci/docker working-directory: pytorch - name: Pull Docker image - uses: pytorch/test-infra/.github/actions/pull-docker-image@main + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.9 with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Test Pytorch binary @@ -3132,6 +2798,7 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: rocm6.4 DESIRED_PYTHON: "3.13t" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + timeout-minutes: 300 build_name: manywheel-py3_13t-rocm6_4 build_environment: linux-binary-manywheel secrets: @@ -3166,7 +2833,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -3188,7 +2854,7 @@ jobs: role-duration-seconds: 18000 - name: Calculate docker image id: calculate-docker-image - uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.9 with: docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: manylinux2_28-builder @@ -3196,7 +2862,7 @@ jobs: docker-build-dir: .ci/docker working-directory: pytorch - name: Pull Docker image - uses: pytorch/test-infra/.github/actions/pull-docker-image@main + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.9 with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Test Pytorch binary @@ -3244,7 +2910,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_13t-xpu build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.1.1 | intel-cmplr-lib-ur==2025.1.1 | intel-cmplr-lic-rt==2025.1.1 | intel-sycl-rt==2025.1.1 | oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.1.0 | onemkl-sycl-dft==2025.1.0 | onemkl-sycl-lapack==2025.1.0 | onemkl-sycl-rng==2025.1.0 | onemkl-sycl-sparse==2025.1.0 | dpcpp-cpp-rt==2025.1.1 | intel-opencl-rt==2025.1.1 | mkl==2025.1.0 | intel-openmp==2025.1.1 | tbb==2022.1.0 | tcmlib==1.3.0 | umf==0.10.0 | intel-pti==0.12.3 + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.2.1 | intel-cmplr-lib-ur==2025.2.1 | intel-cmplr-lic-rt==2025.2.1 | intel-sycl-rt==2025.2.1 | oneccl-devel==2021.16.1; platform_system == 'Linux' and platform_machine == 
'x86_64' | oneccl==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.2.0 | onemkl-sycl-dft==2025.2.0 | onemkl-sycl-lapack==2025.2.0 | onemkl-sycl-rng==2025.2.0 | onemkl-sycl-sparse==2025.2.0 | dpcpp-cpp-rt==2025.2.1 | intel-opencl-rt==2025.2.1 | mkl==2025.2.0 | intel-openmp==2025.2.1 | tbb==2022.2.0 | tcmlib==1.4.0 | umf==0.11.0 | intel-pti==0.13.1 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_13t-xpu-test: # Testing @@ -3270,7 +2936,7 @@ jobs: contents: read steps: - name: Setup XPU - uses: ./.github/actions/setup-xpu + uses: pytorch/pytorch/.github/actions/setup-xpu@release/2.9 - name: configure aws credentials id: aws_creds uses: aws-actions/configure-aws-credentials@v4 @@ -3288,7 +2954,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -3299,7 +2964,7 @@ jobs: working-directory: pytorch - name: Calculate docker image id: calculate-docker-image - uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.9 with: docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: manylinux2_28-builder @@ -3307,7 +2972,7 @@ jobs: docker-build-dir: .ci/docker working-directory: pytorch - name: Pull Docker image - uses: pytorch/test-infra/.github/actions/pull-docker-image@main + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.9 with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Test Pytorch binary @@ -3417,7 +3082,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_14-cuda12_6 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; 
platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_14-cuda12_6-test: # Testing @@ -3483,7 +3148,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_14-cuda12_8 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} 
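
The PYTORCH_EXTRA_INSTALL_REQUIREMENTS hunks above and below trim the PEP 508 environment markers on the pinned CUDA wheels from "platform_system == 'Linux' and platform_machine == 'x86_64'" down to "platform_system == 'Linux'", so those extra wheels are selected on any Linux machine rather than only on x86_64. The sketch below is illustrative only and is not part of the generated workflows: it assumes the third-party packaging library, and the applicable_requirements helper and the shortened sample string are hypothetical, but it shows how pip-style marker evaluation treats the old and new markers.

    # Hypothetical helper (not part of this PR): split the pipe-separated
    # PYTORCH_EXTRA_INSTALL_REQUIREMENTS value and report which entries apply
    # to a given platform, using packaging's PEP 508 marker support.
    from packaging.requirements import Requirement

    def applicable_requirements(extra_requirements: str, environment: dict) -> list:
        """Return the requirement strings whose markers match `environment`."""
        selected = []
        for entry in extra_requirements.split("|"):
            req = Requirement(entry.strip())
            # Entries without a marker (e.g. the XPU oneMKL packages) always apply.
            if req.marker is None or req.marker.evaluate(environment=environment):
                selected.append(str(req))
        return selected

    # With the old markers, an aarch64 Linux install would have skipped these
    # wheels because of the extra platform_machine == 'x86_64' clause; with the
    # simplified markers they are selected for any Linux machine.
    sample = "nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux'"
    print(applicable_requirements(sample, {"platform_system": "Linux", "platform_machine": "aarch64"}))
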
manywheel-py3_14-cuda12_8-test: # Testing @@ -3531,72 +3196,6 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - manywheel-py3_14-cuda12_9-build: - if: ${{ github.repository_owner == 'pytorch' }} - uses: ./.github/workflows/_binary-build-linux.yml - needs: get-label-type - with: - PYTORCH_ROOT: /pytorch - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: "12.9" - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 - DESIRED_PYTHON: "3.14" - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build_name: manywheel-py3_14-cuda12_9 - build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64' - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_14-cuda12_9-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: - - manywheel-py3_14-cuda12_9-build - - get-label-type - uses: ./.github/workflows/_binary-test-linux.yml - with: - PYTORCH_ROOT: /pytorch - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: "12.9" - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 - DESIRED_PYTHON: "3.14" - build_name: manywheel-py3_14-cuda12_9 - build_environment: linux-binary-manywheel - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_14-cuda12_9-upload: # Uploading - if: ${{ github.repository_owner == 'pytorch' }} - permissions: - id-token: write - contents: read - needs: manywheel-py3_14-cuda12_9-test - with: - PYTORCH_ROOT: /pytorch - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: "12.9" - GPU_ARCH_TYPE: cuda - 
DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 - DESIRED_PYTHON: "3.14" - build_name: manywheel-py3_14-cuda12_9 - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - uses: ./.github/workflows/_binary-upload.yml - manywheel-py3_14-cuda13_0-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -3615,7 +3214,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_14-cuda13_0 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.12.0.46; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_14-cuda13_0-test: # Testing @@ -3679,6 +3278,7 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: rocm6.3 DESIRED_PYTHON: "3.14" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + timeout-minutes: 300 build_name: manywheel-py3_14-rocm6_3 build_environment: linux-binary-manywheel secrets: @@ -3713,7 +3313,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -3735,7 +3334,7 @@ jobs: role-duration-seconds: 18000 - name: Calculate docker image id: calculate-docker-image - uses: 
pytorch/test-infra/.github/actions/calculate-docker-image@main + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.9 with: docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: manylinux2_28-builder @@ -3743,7 +3342,7 @@ jobs: docker-build-dir: .ci/docker working-directory: pytorch - name: Pull Docker image - uses: pytorch/test-infra/.github/actions/pull-docker-image@main + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.9 with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Test Pytorch binary @@ -3790,6 +3389,7 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: rocm6.4 DESIRED_PYTHON: "3.14" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + timeout-minutes: 300 build_name: manywheel-py3_14-rocm6_4 build_environment: linux-binary-manywheel secrets: @@ -3824,7 +3424,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -3846,7 +3445,7 @@ jobs: role-duration-seconds: 18000 - name: Calculate docker image id: calculate-docker-image - uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.9 with: docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: manylinux2_28-builder @@ -3854,7 +3453,7 @@ jobs: docker-build-dir: .ci/docker working-directory: pytorch - name: Pull Docker image - uses: pytorch/test-infra/.github/actions/pull-docker-image@main + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.9 with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Test Pytorch binary @@ -3902,7 +3501,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_14-xpu build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.1.1 | intel-cmplr-lib-ur==2025.1.1 | intel-cmplr-lic-rt==2025.1.1 | intel-sycl-rt==2025.1.1 | oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.1.0 | onemkl-sycl-dft==2025.1.0 | onemkl-sycl-lapack==2025.1.0 | onemkl-sycl-rng==2025.1.0 | onemkl-sycl-sparse==2025.1.0 | dpcpp-cpp-rt==2025.1.1 | intel-opencl-rt==2025.1.1 | mkl==2025.1.0 | intel-openmp==2025.1.1 | tbb==2022.1.0 | tcmlib==1.3.0 | umf==0.10.0 | intel-pti==0.12.3 + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.2.1 | intel-cmplr-lib-ur==2025.2.1 | intel-cmplr-lic-rt==2025.2.1 | intel-sycl-rt==2025.2.1 | oneccl-devel==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.2.0 | onemkl-sycl-dft==2025.2.0 | onemkl-sycl-lapack==2025.2.0 | onemkl-sycl-rng==2025.2.0 | onemkl-sycl-sparse==2025.2.0 | dpcpp-cpp-rt==2025.2.1 | intel-opencl-rt==2025.2.1 | mkl==2025.2.0 | intel-openmp==2025.2.1 | tbb==2022.2.0 | tcmlib==1.4.0 | 
umf==0.11.0 | intel-pti==0.13.1 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_14-xpu-test: # Testing @@ -3928,7 +3527,7 @@ jobs: contents: read steps: - name: Setup XPU - uses: ./.github/actions/setup-xpu + uses: pytorch/pytorch/.github/actions/setup-xpu@release/2.9 - name: configure aws credentials id: aws_creds uses: aws-actions/configure-aws-credentials@v4 @@ -3946,7 +3545,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -3957,7 +3555,7 @@ jobs: working-directory: pytorch - name: Calculate docker image id: calculate-docker-image - uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.9 with: docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: manylinux2_28-builder @@ -3965,7 +3563,7 @@ jobs: docker-build-dir: .ci/docker working-directory: pytorch - name: Pull Docker image - uses: pytorch/test-infra/.github/actions/pull-docker-image@main + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.9 with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Test Pytorch binary @@ -4075,7 +3673,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_14t-cuda12_6 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | 
nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_14t-cuda12_6-test: # Testing @@ -4141,7 +3739,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_14t-cuda12_8 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_14t-cuda12_8-test: # Testing @@ -4189,72 +3787,6 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - manywheel-py3_14t-cuda12_9-build: - if: ${{ github.repository_owner == 'pytorch' }} - uses: ./.github/workflows/_binary-build-linux.yml - needs: get-label-type - with: - PYTORCH_ROOT: /pytorch - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # 
favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: "12.9" - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 - DESIRED_PYTHON: "3.14t" - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build_name: manywheel-py3_14t-cuda12_9 - build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64' - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_14t-cuda12_9-test: # Testing - if: ${{ github.repository_owner == 'pytorch' }} - needs: - - manywheel-py3_14t-cuda12_9-build - - get-label-type - uses: ./.github/workflows/_binary-test-linux.yml - with: - PYTORCH_ROOT: /pytorch - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: "12.9" - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 - DESIRED_PYTHON: "3.14t" - build_name: manywheel-py3_14t-cuda12_9 - build_environment: linux-binary-manywheel - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_14t-cuda12_9-upload: # Uploading - if: ${{ github.repository_owner == 'pytorch' }} - permissions: - id-token: write - contents: read - needs: manywheel-py3_14t-cuda12_9-test - with: - PYTORCH_ROOT: /pytorch - PACKAGE_TYPE: manywheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: "12.9" - GPU_ARCH_TYPE: cuda - DOCKER_IMAGE: manylinux2_28-builder - DOCKER_IMAGE_TAG_PREFIX: cuda12.9 - DESIRED_PYTHON: "3.14t" - build_name: manywheel-py3_14t-cuda12_9 - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - uses: ./.github/workflows/_binary-upload.yml - manywheel-py3_14t-cuda13_0-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml @@ -4273,7 +3805,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type 
}}" build_name: manywheel-py3_14t-cuda13_0 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.12.0.46; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_14t-cuda13_0-test: # Testing @@ -4337,6 +3869,7 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: rocm6.3 DESIRED_PYTHON: "3.14t" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + timeout-minutes: 300 build_name: manywheel-py3_14t-rocm6_3 build_environment: linux-binary-manywheel secrets: @@ -4371,7 +3904,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -4393,7 +3925,7 @@ jobs: role-duration-seconds: 18000 - name: Calculate docker image id: calculate-docker-image - uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.9 with: docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: manylinux2_28-builder @@ -4401,7 +3933,7 @@ jobs: docker-build-dir: .ci/docker working-directory: pytorch - name: Pull Docker image - uses: 
pytorch/test-infra/.github/actions/pull-docker-image@main + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.9 with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Test Pytorch binary @@ -4448,6 +3980,7 @@ jobs: DOCKER_IMAGE_TAG_PREFIX: rocm6.4 DESIRED_PYTHON: "3.14t" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + timeout-minutes: 300 build_name: manywheel-py3_14t-rocm6_4 build_environment: linux-binary-manywheel secrets: @@ -4482,7 +4015,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -4504,7 +4036,7 @@ jobs: role-duration-seconds: 18000 - name: Calculate docker image id: calculate-docker-image - uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.9 with: docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: manylinux2_28-builder @@ -4512,7 +4044,7 @@ jobs: docker-build-dir: .ci/docker working-directory: pytorch - name: Pull Docker image - uses: pytorch/test-infra/.github/actions/pull-docker-image@main + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.9 with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Test Pytorch binary @@ -4560,7 +4092,7 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build_name: manywheel-py3_14t-xpu build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.1.1 | intel-cmplr-lib-ur==2025.1.1 | intel-cmplr-lic-rt==2025.1.1 | intel-sycl-rt==2025.1.1 | oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.1.0 | onemkl-sycl-dft==2025.1.0 | onemkl-sycl-lapack==2025.1.0 | onemkl-sycl-rng==2025.1.0 | onemkl-sycl-sparse==2025.1.0 | dpcpp-cpp-rt==2025.1.1 | intel-opencl-rt==2025.1.1 | mkl==2025.1.0 | intel-openmp==2025.1.1 | tbb==2022.1.0 | tcmlib==1.3.0 | umf==0.10.0 | intel-pti==0.12.3 + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.2.1 | intel-cmplr-lib-ur==2025.2.1 | intel-cmplr-lic-rt==2025.2.1 | intel-sycl-rt==2025.2.1 | oneccl-devel==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.2.0 | onemkl-sycl-dft==2025.2.0 | onemkl-sycl-lapack==2025.2.0 | onemkl-sycl-rng==2025.2.0 | onemkl-sycl-sparse==2025.2.0 | dpcpp-cpp-rt==2025.2.1 | intel-opencl-rt==2025.2.1 | mkl==2025.2.0 | intel-openmp==2025.2.1 | tbb==2022.2.0 | tcmlib==1.4.0 | umf==0.11.0 | intel-pti==0.13.1 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_14t-xpu-test: # Testing @@ -4586,7 +4118,7 @@ jobs: contents: read steps: - name: Setup XPU - uses: ./.github/actions/setup-xpu + uses: pytorch/pytorch/.github/actions/setup-xpu@release/2.9 - name: configure aws credentials id: aws_creds uses: aws-actions/configure-aws-credentials@v4 @@ -4604,7 +4136,6 @@ jobs: - name: Checkout PyTorch uses: 
actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -4615,7 +4146,7 @@ jobs: working-directory: pytorch - name: Calculate docker image id: calculate-docker-image - uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.9 with: docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: manylinux2_28-builder @@ -4623,7 +4154,7 @@ jobs: docker-build-dir: .ci/docker working-directory: pytorch - name: Pull Docker image - uses: pytorch/test-infra/.github/actions/pull-docker-image@main + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.9 with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Test Pytorch binary diff --git a/.github/workflows/generated-linux-binary-manywheel-rocm-main.yml b/.github/workflows/generated-linux-binary-manywheel-rocm-main.yml index 8177bac3fe21..9df4835757c4 100644 --- a/.github/workflows/generated-linux-binary-manywheel-rocm-main.yml +++ b/.github/workflows/generated-linux-binary-manywheel-rocm-main.yml @@ -38,13 +38,13 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.9 with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} curr_branch: ${{ github.head_ref || github.ref_name }} curr_ref_type: ${{ github.ref_type }} - manywheel-py3_9-rocm6_4-build: + manywheel-py3_10-rocm6_4-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml needs: get-label-type @@ -58,16 +58,17 @@ jobs: GPU_ARCH_TYPE: rocm DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.4 - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build_name: manywheel-py3_9-rocm6_4 + timeout-minutes: 300 + build_name: manywheel-py3_10-rocm6_4 build_environment: linux-binary-manywheel-rocm secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_9-rocm6_4-test: # Testing + manywheel-py3_10-rocm6_4-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - manywheel-py3_9-rocm6_4-build + - manywheel-py3_10-rocm6_4-build - get-label-type runs-on: linux.rocm.gpu.mi250 timeout-minutes: 240 @@ -82,19 +83,18 @@ jobs: SKIP_ALL_TESTS: 1 DOCKER_IMAGE: manylinux2_28-builder DOCKER_IMAGE_TAG_PREFIX: rocm6.4 - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" steps: - name: Setup ROCm uses: ./.github/actions/setup-rocm - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: manywheel-py3_9-rocm6_4 + name: manywheel-py3_10-rocm6_4 path: "${{ runner.temp }}/artifacts/" - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -116,7 +116,7 @@ jobs: role-duration-seconds: 18000 - name: Calculate docker image id: calculate-docker-image - uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + uses: 
pytorch/test-infra/.github/actions/calculate-docker-image@release/2.9 with: docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }} docker-image-name: manylinux2_28-builder @@ -124,7 +124,7 @@ jobs: docker-build-dir: .ci/docker working-directory: pytorch - name: Pull Docker image - uses: pytorch/test-infra/.github/actions/pull-docker-image@main + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.9 with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Test Pytorch binary diff --git a/.github/workflows/generated-linux-s390x-binary-manywheel-nightly.yml b/.github/workflows/generated-linux-s390x-binary-manywheel-nightly.yml index b0c3c06b2e61..d7fd44031be2 100644 --- a/.github/workflows/generated-linux-s390x-binary-manywheel-nightly.yml +++ b/.github/workflows/generated-linux-s390x-binary-manywheel-nightly.yml @@ -41,7 +41,7 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.9 with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -302,3 +302,195 @@ jobs: secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml + + manywheel-py3_13t-cpu-s390x-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu-s390x + DOCKER_IMAGE: pytorch/manylinuxs390x-builder + DOCKER_IMAGE_TAG_PREFIX: cpu-s390x + DESIRED_PYTHON: "3.13t" + runs_on: linux.s390x + ALPINE_IMAGE: "docker.io/s390x/alpine" + timeout-minutes: 420 + build_name: manywheel-py3_13t-cpu-s390x + build_environment: linux-s390x-binary-manywheel + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_13t-cpu-s390x-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - manywheel-py3_13t-cpu-s390x-build + - get-label-type + uses: ./.github/workflows/_binary-test-linux.yml + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu-s390x + DOCKER_IMAGE: pytorch/manylinuxs390x-builder + DOCKER_IMAGE_TAG_PREFIX: cpu-s390x + DESIRED_PYTHON: "3.13t" + build_name: manywheel-py3_13t-cpu-s390x + build_environment: linux-s390x-binary-manywheel + runs_on: linux.s390x + ALPINE_IMAGE: "docker.io/s390x/alpine" + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_13t-cpu-s390x-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_13t-cpu-s390x-test + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu-s390x + DOCKER_IMAGE: pytorch/manylinuxs390x-builder + DOCKER_IMAGE_TAG_PREFIX: cpu-s390x + DESIRED_PYTHON: "3.13t" + build_name: manywheel-py3_13t-cpu-s390x + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} 
+ uses: ./.github/workflows/_binary-upload.yml + + manywheel-py3_14-cpu-s390x-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu-s390x + DOCKER_IMAGE: pytorch/manylinuxs390x-builder + DOCKER_IMAGE_TAG_PREFIX: cpu-s390x + DESIRED_PYTHON: "3.14" + runs_on: linux.s390x + ALPINE_IMAGE: "docker.io/s390x/alpine" + timeout-minutes: 420 + build_name: manywheel-py3_14-cpu-s390x + build_environment: linux-s390x-binary-manywheel + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_14-cpu-s390x-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - manywheel-py3_14-cpu-s390x-build + - get-label-type + uses: ./.github/workflows/_binary-test-linux.yml + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu-s390x + DOCKER_IMAGE: pytorch/manylinuxs390x-builder + DOCKER_IMAGE_TAG_PREFIX: cpu-s390x + DESIRED_PYTHON: "3.14" + build_name: manywheel-py3_14-cpu-s390x + build_environment: linux-s390x-binary-manywheel + runs_on: linux.s390x + ALPINE_IMAGE: "docker.io/s390x/alpine" + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_14-cpu-s390x-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_14-cpu-s390x-test + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu-s390x + DOCKER_IMAGE: pytorch/manylinuxs390x-builder + DOCKER_IMAGE_TAG_PREFIX: cpu-s390x + DESIRED_PYTHON: "3.14" + build_name: manywheel-py3_14-cpu-s390x + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml + + manywheel-py3_14t-cpu-s390x-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + needs: get-label-type + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu-s390x + DOCKER_IMAGE: pytorch/manylinuxs390x-builder + DOCKER_IMAGE_TAG_PREFIX: cpu-s390x + DESIRED_PYTHON: "3.14t" + runs_on: linux.s390x + ALPINE_IMAGE: "docker.io/s390x/alpine" + timeout-minutes: 420 + build_name: manywheel-py3_14t-cpu-s390x + build_environment: linux-s390x-binary-manywheel + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_14t-cpu-s390x-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: + - manywheel-py3_14t-cpu-s390x-build + - get-label-type + uses: ./.github/workflows/_binary-test-linux.yml + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu-s390x + DOCKER_IMAGE: pytorch/manylinuxs390x-builder + DOCKER_IMAGE_TAG_PREFIX: cpu-s390x + DESIRED_PYTHON: "3.14t" + build_name: manywheel-py3_14t-cpu-s390x + build_environment: linux-s390x-binary-manywheel + runs_on: linux.s390x + ALPINE_IMAGE: 
"docker.io/s390x/alpine" + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_14t-cpu-s390x-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_14t-cpu-s390x-test + with: + PYTORCH_ROOT: /pytorch + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu-s390x + DOCKER_IMAGE: pytorch/manylinuxs390x-builder + DOCKER_IMAGE_TAG_PREFIX: cpu-s390x + DESIRED_PYTHON: "3.14t" + build_name: manywheel-py3_14t-cpu-s390x + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + uses: ./.github/workflows/_binary-upload.yml diff --git a/.github/workflows/generated-macos-arm64-binary-libtorch-release-nightly.yml b/.github/workflows/generated-macos-arm64-binary-libtorch-release-nightly.yml index ad7a1cf1d71d..5f21fc565901 100644 --- a/.github/workflows/generated-macos-arm64-binary-libtorch-release-nightly.yml +++ b/.github/workflows/generated-macos-arm64-binary-libtorch-release-nightly.yml @@ -46,7 +46,7 @@ jobs: LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -67,15 +67,9 @@ jobs: chmod +x "${RUNNER_TEMP}/conda.sh" /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - if [ -d "/Applications/Xcode_14.3.1.app" ]; then - echo "DEVELOPER_DIR=/Applications/Xcode_14.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" - elif [ -d "/Applications/Xcode_13.3.1.app" ]; then - echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" - fi - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false diff --git a/.github/workflows/generated-macos-arm64-binary-wheel-nightly.yml b/.github/workflows/generated-macos-arm64-binary-wheel-nightly.yml index bcc7279dd777..b12a5212cd4e 100644 --- a/.github/workflows/generated-macos-arm64-binary-wheel-nightly.yml +++ b/.github/workflows/generated-macos-arm64-binary-wheel-nightly.yml @@ -63,15 +63,9 @@ jobs: chmod +x "${RUNNER_TEMP}/conda.sh" /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - if [ -d "/Applications/Xcode_14.3.1.app" ]; then - echo "DEVELOPER_DIR=/Applications/Xcode_14.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" - elif [ -d "/Applications/Xcode_13.3.1.app" ]; then - echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" - fi - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -208,15 +202,9 @@ jobs: chmod +x "${RUNNER_TEMP}/conda.sh" /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - if [ -d "/Applications/Xcode_14.3.1.app" ]; then - echo 
"DEVELOPER_DIR=/Applications/Xcode_14.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" - elif [ -d "/Applications/Xcode_13.3.1.app" ]; then - echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" - fi - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -353,15 +341,9 @@ jobs: chmod +x "${RUNNER_TEMP}/conda.sh" /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - if [ -d "/Applications/Xcode_14.3.1.app" ]; then - echo "DEVELOPER_DIR=/Applications/Xcode_14.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" - elif [ -d "/Applications/Xcode_13.3.1.app" ]; then - echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" - fi - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -498,15 +480,9 @@ jobs: chmod +x "${RUNNER_TEMP}/conda.sh" /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - if [ -d "/Applications/Xcode_14.3.1.app" ]; then - echo "DEVELOPER_DIR=/Applications/Xcode_14.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" - elif [ -d "/Applications/Xcode_13.3.1.app" ]; then - echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" - fi - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -643,15 +619,9 @@ jobs: chmod +x "${RUNNER_TEMP}/conda.sh" /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - if [ -d "/Applications/Xcode_14.3.1.app" ]; then - echo "DEVELOPER_DIR=/Applications/Xcode_14.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" - elif [ -d "/Applications/Xcode_13.3.1.app" ]; then - echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" - fi - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -788,15 +758,9 @@ jobs: chmod +x "${RUNNER_TEMP}/conda.sh" /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - if [ -d "/Applications/Xcode_14.3.1.app" ]; then - echo "DEVELOPER_DIR=/Applications/Xcode_14.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" - elif [ -d "/Applications/Xcode_13.3.1.app" ]; then - echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" - fi - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -933,15 +897,9 @@ jobs: chmod +x "${RUNNER_TEMP}/conda.sh" /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - if [ -d "/Applications/Xcode_14.3.1.app" ]; then - echo "DEVELOPER_DIR=/Applications/Xcode_14.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" 
- elif [ -d "/Applications/Xcode_13.3.1.app" ]; then - echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" - fi - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false diff --git a/.github/workflows/generated-windows-arm64-binary-libtorch-debug-nightly.yml b/.github/workflows/generated-windows-arm64-binary-libtorch-debug-nightly.yml index 2c86e7e10359..7a8ea9cbfa2c 100644 --- a/.github/workflows/generated-windows-arm64-binary-libtorch-debug-nightly.yml +++ b/.github/workflows/generated-windows-arm64-binary-libtorch-debug-nightly.yml @@ -41,7 +41,7 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.9 with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -51,7 +51,7 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "windows-11-arm64-preview" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch @@ -64,7 +64,7 @@ jobs: LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" steps: - name: Populate binary env shell: cmd @@ -128,7 +128,7 @@ jobs: - libtorch-cpu-shared-with-deps-debug-build - get-label-type runs-on: "windows-11-arm64-preview" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch @@ -141,7 +141,7 @@ jobs: LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" steps: - name: Populate binary env shell: cmd @@ -201,7 +201,7 @@ jobs: LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" build_name: libtorch-cpu-shared-with-deps-debug secrets: github-token: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/generated-windows-arm64-binary-libtorch-release-nightly.yml b/.github/workflows/generated-windows-arm64-binary-libtorch-release-nightly.yml index 912a452f0ee8..14081649d370 100644 --- a/.github/workflows/generated-windows-arm64-binary-libtorch-release-nightly.yml +++ b/.github/workflows/generated-windows-arm64-binary-libtorch-release-nightly.yml @@ -41,7 +41,7 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.9 with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -51,7 +51,7 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "windows-11-arm64-preview" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ 
github.workspace }}/pytorch PACKAGE_TYPE: libtorch @@ -64,7 +64,7 @@ jobs: LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" steps: - name: Populate binary env shell: cmd @@ -128,7 +128,7 @@ jobs: - libtorch-cpu-shared-with-deps-release-build - get-label-type runs-on: "windows-11-arm64-preview" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch @@ -141,7 +141,7 @@ jobs: LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" steps: - name: Populate binary env shell: cmd @@ -201,7 +201,7 @@ jobs: LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" build_name: libtorch-cpu-shared-with-deps-release secrets: github-token: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/generated-windows-arm64-binary-wheel-nightly.yml b/.github/workflows/generated-windows-arm64-binary-wheel-nightly.yml index 1dd70d0d06a9..d0e02dade299 100644 --- a/.github/workflows/generated-windows-arm64-binary-wheel-nightly.yml +++ b/.github/workflows/generated-windows-arm64-binary-wheel-nightly.yml @@ -41,7 +41,7 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.9 with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -51,7 +51,7 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "windows-11-arm64-preview" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -124,7 +124,7 @@ jobs: - wheel-py3_11-cpu-build - get-label-type runs-on: "windows-11-arm64-preview" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -198,7 +198,7 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "windows-11-arm64-preview" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -271,7 +271,7 @@ jobs: - wheel-py3_12-cpu-build - get-label-type runs-on: "windows-11-arm64-preview" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -345,7 +345,7 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "windows-11-arm64-preview" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -418,7 +418,7 @@ jobs: - wheel-py3_13-cpu-build - get-label-type runs-on: "windows-11-arm64-preview" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel diff --git a/.github/workflows/generated-windows-binary-libtorch-debug-main.yml b/.github/workflows/generated-windows-binary-libtorch-debug-main.yml index 
ac15a9f3e97a..3df2c65440a5 100644 --- a/.github/workflows/generated-windows-binary-libtorch-debug-main.yml +++ b/.github/workflows/generated-windows-binary-libtorch-debug-main.yml @@ -28,7 +28,7 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.9 with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -38,7 +38,7 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch @@ -51,7 +51,7 @@ jobs: LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -77,7 +77,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -109,7 +109,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -153,7 +152,7 @@ jobs: - libtorch-cpu-shared-with-deps-debug-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch @@ -166,7 +165,7 @@ jobs: LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" steps: - name: Display EC2 information shell: bash @@ -183,7 +182,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -215,7 +214,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false diff --git a/.github/workflows/generated-windows-binary-libtorch-debug-nightly.yml b/.github/workflows/generated-windows-binary-libtorch-debug-nightly.yml index 9c3a96d4caee..f4413a86c657 100644 --- a/.github/workflows/generated-windows-binary-libtorch-debug-nightly.yml +++ b/.github/workflows/generated-windows-binary-libtorch-debug-nightly.yml @@ -35,7 +35,7 @@ jobs: get-label-type: if: 
github.repository_owner == 'pytorch' name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.9 with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -45,7 +45,7 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch @@ -58,7 +58,7 @@ jobs: LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -84,7 +84,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -116,7 +116,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -160,7 +159,7 @@ jobs: - libtorch-cpu-shared-with-deps-debug-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch @@ -173,7 +172,7 @@ jobs: LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" steps: - name: Display EC2 information shell: bash @@ -190,7 +189,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -222,7 +221,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -283,7 +281,7 @@ jobs: LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" build_name: libtorch-cpu-shared-with-deps-debug secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -292,7 +290,7 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch @@ -306,7 
+304,7 @@ jobs: LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -332,7 +330,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -364,7 +362,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -408,7 +405,7 @@ jobs: - libtorch-cuda12_6-shared-with-deps-debug-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch @@ -422,7 +419,7 @@ jobs: LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" steps: - name: Display EC2 information shell: bash @@ -439,7 +436,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -471,7 +468,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -533,7 +529,7 @@ jobs: LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" build_name: libtorch-cuda12_6-shared-with-deps-debug secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -542,7 +538,7 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch @@ -556,7 +552,7 @@ jobs: LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -582,7 +578,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: 
pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -614,7 +610,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -658,7 +653,7 @@ jobs: - libtorch-cuda12_8-shared-with-deps-debug-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch @@ -672,7 +667,7 @@ jobs: LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" steps: - name: Display EC2 information shell: bash @@ -689,7 +684,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -721,7 +716,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -783,30 +777,30 @@ jobs: LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" build_name: libtorch-cuda12_8-shared-with-deps-debug secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - libtorch-cuda12_9-shared-with-deps-debug-build: + libtorch-cuda13_0-shared-with-deps-debug-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: "12.9" + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 LIBTORCH_CONFIG: debug LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -832,7 +826,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -864,7 +858,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: 
${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -884,7 +877,7 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: - name: libtorch-cuda12_9-shared-with-deps-debug + name: libtorch-cuda13_0-shared-with-deps-debug retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -902,27 +895,27 @@ jobs: run: | .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cuda12_9-shared-with-deps-debug-test: # Testing + libtorch-cuda13_0-shared-with-deps-debug-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - libtorch-cuda12_9-shared-with-deps-debug-build + - libtorch-cuda13_0-shared-with-deps-debug-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: "12.9" + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 LIBTORCH_CONFIG: debug LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" steps: - name: Display EC2 information shell: bash @@ -939,7 +932,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -971,7 +964,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -992,7 +984,7 @@ jobs: - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: libtorch-cuda12_9-shared-with-deps-debug + name: libtorch-cuda13_0-shared-with-deps-debug path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Populate binary env shell: bash @@ -1015,26 +1007,26 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cuda12_9-shared-with-deps-debug-upload: # Uploading + libtorch-cuda13_0-shared-with-deps-debug-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: libtorch-cuda12_9-shared-with-deps-debug-test + needs: libtorch-cuda13_0-shared-with-deps-debug-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: "12.9" + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda LIBTORCH_CONFIG: debug LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" - build_name: libtorch-cuda12_9-shared-with-deps-debug + DESIRED_PYTHON: "3.10" + build_name: libtorch-cuda13_0-shared-with-deps-debug secrets: github-token: ${{ 
secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml diff --git a/.github/workflows/generated-windows-binary-libtorch-release-main.yml b/.github/workflows/generated-windows-binary-libtorch-release-main.yml index 9a0a3496e37b..ef94d6212af3 100644 --- a/.github/workflows/generated-windows-binary-libtorch-release-main.yml +++ b/.github/workflows/generated-windows-binary-libtorch-release-main.yml @@ -28,7 +28,7 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.9 with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -38,7 +38,7 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch @@ -51,7 +51,7 @@ jobs: LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -77,7 +77,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -109,7 +109,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -153,7 +152,7 @@ jobs: - libtorch-cpu-shared-with-deps-release-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch @@ -166,7 +165,7 @@ jobs: LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" steps: - name: Display EC2 information shell: bash @@ -183,7 +182,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -215,7 +214,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false diff --git a/.github/workflows/generated-windows-binary-libtorch-release-nightly.yml b/.github/workflows/generated-windows-binary-libtorch-release-nightly.yml index 
d212894b7443..8f4ec6e0b205 100644 --- a/.github/workflows/generated-windows-binary-libtorch-release-nightly.yml +++ b/.github/workflows/generated-windows-binary-libtorch-release-nightly.yml @@ -35,7 +35,7 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.9 with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -45,7 +45,7 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch @@ -58,7 +58,7 @@ jobs: LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -84,7 +84,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -116,7 +116,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -160,7 +159,7 @@ jobs: - libtorch-cpu-shared-with-deps-release-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch @@ -173,7 +172,7 @@ jobs: LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" steps: - name: Display EC2 information shell: bash @@ -190,7 +189,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -222,7 +221,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -283,7 +281,7 @@ jobs: LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" build_name: libtorch-cpu-shared-with-deps-release secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -292,7 +290,7 @@ jobs: if: ${{ github.repository_owner == 
'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch @@ -306,7 +304,7 @@ jobs: LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -332,7 +330,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -364,7 +362,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -408,7 +405,7 @@ jobs: - libtorch-cuda12_6-shared-with-deps-release-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch @@ -422,7 +419,7 @@ jobs: LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" steps: - name: Display EC2 information shell: bash @@ -439,7 +436,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -471,7 +468,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -533,7 +529,7 @@ jobs: LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" build_name: libtorch-cuda12_6-shared-with-deps-release secrets: github-token: ${{ secrets.GITHUB_TOKEN }} @@ -542,7 +538,7 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch @@ -556,7 +552,7 @@ jobs: LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a 
workflow level doesn't give us access to the @@ -582,7 +578,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -614,7 +610,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -658,7 +653,7 @@ jobs: - libtorch-cuda12_8-shared-with-deps-release-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch @@ -672,7 +667,7 @@ jobs: LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" steps: - name: Display EC2 information shell: bash @@ -689,7 +684,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -721,7 +716,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -783,30 +777,30 @@ jobs: LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" build_name: libtorch-cuda12_8-shared-with-deps-release secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - libtorch-cuda12_9-shared-with-deps-release-build: + libtorch-cuda13_0-shared-with-deps-release-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: "12.9" + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 LIBTORCH_CONFIG: release LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -832,7 +826,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: 
pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -864,7 +858,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -884,7 +877,7 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: - name: libtorch-cuda12_9-shared-with-deps-release + name: libtorch-cuda13_0-shared-with-deps-release retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -902,27 +895,27 @@ jobs: run: | .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cuda12_9-shared-with-deps-release-test: # Testing + libtorch-cuda13_0-shared-with-deps-release-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - libtorch-cuda12_9-shared-with-deps-release-build + - libtorch-cuda13_0-shared-with-deps-release-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: "12.9" + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 LIBTORCH_CONFIG: release LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" + DESIRED_PYTHON: "3.10" steps: - name: Display EC2 information shell: bash @@ -939,7 +932,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -971,7 +964,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -992,7 +984,7 @@ jobs: - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: libtorch-cuda12_9-shared-with-deps-release + name: libtorch-cuda13_0-shared-with-deps-release path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Populate binary env shell: bash @@ -1015,26 +1007,26 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - libtorch-cuda12_9-shared-with-deps-release-upload: # Uploading + libtorch-cuda13_0-shared-with-deps-release-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: libtorch-cuda12_9-shared-with-deps-release-test + needs: libtorch-cuda13_0-shared-with-deps-release-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: libtorch # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: "12.9" + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda LIBTORCH_CONFIG: release LIBTORCH_VARIANT: shared-with-deps # This is a dummy value for libtorch 
to work correctly with our batch scripts # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.9" - build_name: libtorch-cuda12_9-shared-with-deps-release + DESIRED_PYTHON: "3.10" + build_name: libtorch-cuda13_0-shared-with-deps-release secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml diff --git a/.github/workflows/generated-windows-binary-wheel-nightly.yml b/.github/workflows/generated-windows-binary-wheel-nightly.yml index b476973a1d86..bca8d4843463 100644 --- a/.github/workflows/generated-windows-binary-wheel-nightly.yml +++ b/.github/workflows/generated-windows-binary-wheel-nightly.yml @@ -35,7 +35,7 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.9 with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -45,7 +45,7 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -80,7 +80,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -112,7 +112,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -156,7 +155,7 @@ jobs: - wheel-py3_10-cpu-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -182,7 +181,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -214,7 +213,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -280,7 +278,7 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -316,7 +314,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -348,7 +346,6 @@ jobs: - name: Checkout 
PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -392,7 +389,7 @@ jobs: - wheel-py3_10-cuda12_6-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -419,7 +416,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -451,7 +448,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -518,7 +514,7 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -554,7 +550,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -586,7 +582,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -630,7 +625,7 @@ jobs: - wheel-py3_10-cuda12_8-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -657,7 +652,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -689,7 +684,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -752,18 +746,18 @@ jobs: secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_10-cuda12_9-build: + wheel-py3_10-cuda13_0-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: "12.9" + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 
DESIRED_PYTHON: "3.10" @@ -792,7 +786,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -824,7 +818,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -844,7 +837,7 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: - name: wheel-py3_10-cuda12_9 + name: wheel-py3_10-cuda13_0 retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -862,20 +855,20 @@ jobs: run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_10-cuda12_9-test: # Testing + wheel-py3_10-cuda13_0-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - wheel-py3_10-cuda12_9-build + - wheel-py3_10-cuda13_0-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: "12.9" + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.10" @@ -895,7 +888,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -927,7 +920,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -948,7 +940,7 @@ jobs: - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: wheel-py3_10-cuda12_9 + name: wheel-py3_10-cuda13_0 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Populate binary env shell: bash @@ -971,22 +963,22 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_10-cuda12_9-upload: # Uploading + wheel-py3_10-cuda13_0-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: wheel-py3_10-cuda12_9-test + needs: wheel-py3_10-cuda13_0-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: "12.9" + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda DESIRED_PYTHON: "3.10" - build_name: wheel-py3_10-cuda12_9 + build_name: wheel-py3_10-cuda13_0 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml @@ -994,7 +986,7 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: 
${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -1004,7 +996,7 @@ jobs: GPU_ARCH_TYPE: xpu SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.10" - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.1.1 | intel-cmplr-lib-ur==2025.1.1 | intel-cmplr-lic-rt==2025.1.1 | intel-sycl-rt==2025.1.1 | oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.1.0 | onemkl-sycl-dft==2025.1.0 | onemkl-sycl-lapack==2025.1.0 | onemkl-sycl-rng==2025.1.0 | onemkl-sycl-sparse==2025.1.0 | dpcpp-cpp-rt==2025.1.1 | intel-opencl-rt==2025.1.1 | mkl==2025.1.0 | intel-openmp==2025.1.1 | tbb==2022.1.0 | tcmlib==1.3.0 | umf==0.10.0 | intel-pti==0.12.3 + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.2.1 | intel-cmplr-lib-ur==2025.2.1 | intel-cmplr-lic-rt==2025.2.1 | intel-sycl-rt==2025.2.1 | oneccl-devel==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.2.0 | onemkl-sycl-dft==2025.2.0 | onemkl-sycl-lapack==2025.2.0 | onemkl-sycl-rng==2025.2.0 | onemkl-sycl-sparse==2025.2.0 | dpcpp-cpp-rt==2025.2.1 | intel-opencl-rt==2025.2.1 | mkl==2025.2.0 | intel-openmp==2025.2.1 | tbb==2022.2.0 | tcmlib==1.4.0 | umf==0.11.0 | intel-pti==0.13.1 steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -1030,7 +1022,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -1062,7 +1054,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -1106,7 +1097,7 @@ jobs: - wheel-py3_10-xpu-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -1132,7 +1123,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -1164,7 +1155,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -1230,7 +1220,7 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ 
-1265,7 +1255,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -1297,7 +1287,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -1341,7 +1330,7 @@ jobs: - wheel-py3_11-cpu-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -1367,7 +1356,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -1399,7 +1388,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -1465,7 +1453,7 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -1501,7 +1489,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -1533,7 +1521,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -1577,7 +1564,7 @@ jobs: - wheel-py3_11-cuda12_6-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -1604,7 +1591,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -1636,7 +1623,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -1703,7 +1689,7 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace 
}}/pytorch PACKAGE_TYPE: wheel @@ -1739,7 +1725,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -1771,7 +1757,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -1815,7 +1800,7 @@ jobs: - wheel-py3_11-cuda12_8-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -1842,7 +1827,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -1874,7 +1859,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -1937,18 +1921,18 @@ jobs: secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_11-cuda12_9-build: + wheel-py3_11-cuda13_0-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: "12.9" + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.11" @@ -1977,7 +1961,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -2009,7 +1993,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -2029,7 +2012,7 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: - name: wheel-py3_11-cuda12_9 + name: wheel-py3_11-cuda13_0 retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -2047,20 +2030,20 @@ jobs: run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_11-cuda12_9-test: # Testing + wheel-py3_11-cuda13_0-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - wheel-py3_11-cuda12_9-build + - wheel-py3_11-cuda13_0-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ 
github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: "12.9" + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.11" @@ -2080,7 +2063,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -2112,7 +2095,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -2133,7 +2115,7 @@ jobs: - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: wheel-py3_11-cuda12_9 + name: wheel-py3_11-cuda13_0 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Populate binary env shell: bash @@ -2156,22 +2138,22 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_11-cuda12_9-upload: # Uploading + wheel-py3_11-cuda13_0-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: wheel-py3_11-cuda12_9-test + needs: wheel-py3_11-cuda13_0-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: "12.9" + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda DESIRED_PYTHON: "3.11" - build_name: wheel-py3_11-cuda12_9 + build_name: wheel-py3_11-cuda13_0 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml @@ -2179,7 +2161,7 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -2189,7 +2171,7 @@ jobs: GPU_ARCH_TYPE: xpu SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.11" - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.1.1 | intel-cmplr-lib-ur==2025.1.1 | intel-cmplr-lic-rt==2025.1.1 | intel-sycl-rt==2025.1.1 | oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.1.0 | onemkl-sycl-dft==2025.1.0 | onemkl-sycl-lapack==2025.1.0 | onemkl-sycl-rng==2025.1.0 | onemkl-sycl-sparse==2025.1.0 | dpcpp-cpp-rt==2025.1.1 | intel-opencl-rt==2025.1.1 | mkl==2025.1.0 | intel-openmp==2025.1.1 | tbb==2022.1.0 | tcmlib==1.3.0 | umf==0.10.0 | intel-pti==0.12.3 + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.2.1 | intel-cmplr-lib-ur==2025.2.1 | intel-cmplr-lic-rt==2025.2.1 | intel-sycl-rt==2025.2.1 | oneccl-devel==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.2.0 | 
onemkl-sycl-dft==2025.2.0 | onemkl-sycl-lapack==2025.2.0 | onemkl-sycl-rng==2025.2.0 | onemkl-sycl-sparse==2025.2.0 | dpcpp-cpp-rt==2025.2.1 | intel-opencl-rt==2025.2.1 | mkl==2025.2.0 | intel-openmp==2025.2.1 | tbb==2022.2.0 | tcmlib==1.4.0 | umf==0.11.0 | intel-pti==0.13.1 steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -2215,7 +2197,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -2247,7 +2229,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -2291,7 +2272,7 @@ jobs: - wheel-py3_11-xpu-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -2317,7 +2298,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -2349,7 +2330,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -2415,7 +2395,7 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -2450,7 +2430,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -2482,7 +2462,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -2526,7 +2505,7 @@ jobs: - wheel-py3_12-cpu-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -2552,7 +2531,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -2584,7 +2563,6 @@ jobs: - name: 
Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -2650,7 +2628,7 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -2686,7 +2664,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -2718,7 +2696,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -2762,7 +2739,7 @@ jobs: - wheel-py3_12-cuda12_6-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -2789,7 +2766,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -2821,7 +2798,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -2888,7 +2864,7 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -2924,7 +2900,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -2956,7 +2932,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -3000,7 +2975,7 @@ jobs: - wheel-py3_12-cuda12_8-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -3027,7 +3002,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN 
}} @@ -3059,7 +3034,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -3122,18 +3096,18 @@ jobs: secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_12-cuda12_9-build: + wheel-py3_12-cuda13_0-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: "12.9" + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.12" @@ -3162,7 +3136,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -3194,7 +3168,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -3214,7 +3187,7 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: - name: wheel-py3_12-cuda12_9 + name: wheel-py3_12-cuda13_0 retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -3232,20 +3205,20 @@ jobs: run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_12-cuda12_9-test: # Testing + wheel-py3_12-cuda13_0-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - wheel-py3_12-cuda12_9-build + - wheel-py3_12-cuda13_0-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: "12.9" + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.12" @@ -3265,7 +3238,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -3297,7 +3270,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -3318,7 +3290,7 @@ jobs: - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: wheel-py3_12-cuda12_9 + name: wheel-py3_12-cuda13_0 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Populate binary env shell: bash @@ -3341,22 +3313,22 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - 
wheel-py3_12-cuda12_9-upload: # Uploading + wheel-py3_12-cuda13_0-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: wheel-py3_12-cuda12_9-test + needs: wheel-py3_12-cuda13_0-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: "12.9" + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda DESIRED_PYTHON: "3.12" - build_name: wheel-py3_12-cuda12_9 + build_name: wheel-py3_12-cuda13_0 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml @@ -3364,7 +3336,7 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -3374,7 +3346,7 @@ jobs: GPU_ARCH_TYPE: xpu SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.12" - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.1.1 | intel-cmplr-lib-ur==2025.1.1 | intel-cmplr-lic-rt==2025.1.1 | intel-sycl-rt==2025.1.1 | oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.1.0 | onemkl-sycl-dft==2025.1.0 | onemkl-sycl-lapack==2025.1.0 | onemkl-sycl-rng==2025.1.0 | onemkl-sycl-sparse==2025.1.0 | dpcpp-cpp-rt==2025.1.1 | intel-opencl-rt==2025.1.1 | mkl==2025.1.0 | intel-openmp==2025.1.1 | tbb==2022.1.0 | tcmlib==1.3.0 | umf==0.10.0 | intel-pti==0.12.3 + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.2.1 | intel-cmplr-lib-ur==2025.2.1 | intel-cmplr-lic-rt==2025.2.1 | intel-sycl-rt==2025.2.1 | oneccl-devel==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.2.0 | onemkl-sycl-dft==2025.2.0 | onemkl-sycl-lapack==2025.2.0 | onemkl-sycl-rng==2025.2.0 | onemkl-sycl-sparse==2025.2.0 | dpcpp-cpp-rt==2025.2.1 | intel-opencl-rt==2025.2.1 | mkl==2025.2.0 | intel-openmp==2025.2.1 | tbb==2022.2.0 | tcmlib==1.4.0 | umf==0.11.0 | intel-pti==0.13.1 steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -3400,7 +3372,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -3432,7 +3404,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -3476,7 +3447,7 @@ jobs: - wheel-py3_12-xpu-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: 
PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -3502,7 +3473,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -3534,7 +3505,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -3600,7 +3570,7 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -3635,7 +3605,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -3667,7 +3637,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -3711,7 +3680,7 @@ jobs: - wheel-py3_13-cpu-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -3737,7 +3706,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -3769,7 +3738,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -3835,7 +3803,7 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -3871,7 +3839,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -3903,7 +3871,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -3947,7 +3914,7 @@ jobs: - wheel-py3_13-cuda12_6-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" - timeout-minutes: 
300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -3974,7 +3941,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -4006,7 +3973,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -4073,7 +4039,7 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -4109,7 +4075,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -4141,7 +4107,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -4185,7 +4150,7 @@ jobs: - wheel-py3_13-cuda12_8-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -4212,7 +4177,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -4244,7 +4209,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -4307,18 +4271,18 @@ jobs: secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_13-cuda12_9-build: + wheel-py3_13-cuda13_0-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: "12.9" + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.13" @@ -4347,7 +4311,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true 
with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -4379,7 +4343,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -4399,7 +4362,7 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: - name: wheel-py3_13-cuda12_9 + name: wheel-py3_13-cuda13_0 retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -4417,20 +4380,20 @@ jobs: run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_13-cuda12_9-test: # Testing + wheel-py3_13-cuda13_0-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - wheel-py3_13-cuda12_9-build + - wheel-py3_13-cuda13_0-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: "12.9" + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.13" @@ -4450,7 +4413,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -4482,7 +4445,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -4503,7 +4465,7 @@ jobs: - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: wheel-py3_13-cuda12_9 + name: wheel-py3_13-cuda13_0 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Populate binary env shell: bash @@ -4526,22 +4488,22 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_13-cuda12_9-upload: # Uploading + wheel-py3_13-cuda13_0-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: wheel-py3_13-cuda12_9-test + needs: wheel-py3_13-cuda13_0-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: "12.9" + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda DESIRED_PYTHON: "3.13" - build_name: wheel-py3_13-cuda12_9 + build_name: wheel-py3_13-cuda13_0 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml @@ -4549,7 +4511,7 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -4559,7 +4521,7 @@ jobs: GPU_ARCH_TYPE: xpu SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.13" - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.1.1 | intel-cmplr-lib-ur==2025.1.1 | intel-cmplr-lic-rt==2025.1.1 | intel-sycl-rt==2025.1.1 | oneccl-devel==2021.15.2; 
platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.1.0 | onemkl-sycl-dft==2025.1.0 | onemkl-sycl-lapack==2025.1.0 | onemkl-sycl-rng==2025.1.0 | onemkl-sycl-sparse==2025.1.0 | dpcpp-cpp-rt==2025.1.1 | intel-opencl-rt==2025.1.1 | mkl==2025.1.0 | intel-openmp==2025.1.1 | tbb==2022.1.0 | tcmlib==1.3.0 | umf==0.10.0 | intel-pti==0.12.3 + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.2.1 | intel-cmplr-lib-ur==2025.2.1 | intel-cmplr-lic-rt==2025.2.1 | intel-sycl-rt==2025.2.1 | oneccl-devel==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.2.0 | onemkl-sycl-dft==2025.2.0 | onemkl-sycl-lapack==2025.2.0 | onemkl-sycl-rng==2025.2.0 | onemkl-sycl-sparse==2025.2.0 | dpcpp-cpp-rt==2025.2.1 | intel-opencl-rt==2025.2.1 | mkl==2025.2.0 | intel-openmp==2025.2.1 | tbb==2022.2.0 | tcmlib==1.4.0 | umf==0.11.0 | intel-pti==0.13.1 steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -4585,7 +4547,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -4617,7 +4579,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -4661,7 +4622,7 @@ jobs: - wheel-py3_13-xpu-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -4687,7 +4648,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -4719,7 +4680,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -4785,7 +4745,7 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -4820,7 +4780,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true 
with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -4852,7 +4812,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -4896,7 +4855,7 @@ jobs: - wheel-py3_13t-cpu-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -4922,7 +4881,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -4954,7 +4913,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -5020,7 +4978,7 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -5056,7 +5014,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -5088,7 +5046,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -5132,7 +5089,7 @@ jobs: - wheel-py3_13t-cuda12_6-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -5159,7 +5116,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -5191,7 +5148,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -5258,7 +5214,7 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -5294,7 +5250,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: 
pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -5326,7 +5282,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -5370,7 +5325,7 @@ jobs: - wheel-py3_13t-cuda12_8-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -5397,7 +5352,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -5429,7 +5384,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -5492,18 +5446,18 @@ jobs: secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_13t-cuda12_9-build: + wheel-py3_13t-cuda13_0-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: "12.9" + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.13t" @@ -5532,7 +5486,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -5564,7 +5518,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -5584,7 +5537,7 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: - name: wheel-py3_13t-cuda12_9 + name: wheel-py3_13t-cuda13_0 retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -5602,20 +5555,20 @@ jobs: run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_13t-cuda12_9-test: # Testing + wheel-py3_13t-cuda13_0-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - wheel-py3_13t-cuda12_9-build + - wheel-py3_13t-cuda13_0-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: "12.9" + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda 
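The hunk above renames the CUDA 12.9 jobs to CUDA 13.0 and keeps several spellings of the same version in sync: the legacy DESIRED_CUDA string (cu130) that the TODO comment wants to retire, GPU_ARCH_VERSION ("13.0"), and the cuda13_0 suffix used in the surrounding job and artifact names such as wheel-py3_13t-cuda13_0. The Python sketch below only illustrates how those spellings relate to one another; the helper names are hypothetical and are not part of PyTorch's build scripts.

```python
# Hypothetical helpers, for illustration only: they show how the identifiers in
# this diff relate (GPU_ARCH_VERSION "13.0" -> DESIRED_CUDA "cu130", and the
# "cuda13_0" suffix used in job/artifact names). Not part of the PyTorch repo.

def desired_cuda(gpu_arch_version: str) -> str:
    """'13.0' -> 'cu130' (the legacy DESIRED_CUDA spelling)."""
    return "cu" + gpu_arch_version.replace(".", "")

def build_name(package_type: str, desired_python: str, gpu_arch_version: str) -> str:
    """('wheel', '3.13t', '13.0') -> 'wheel-py3_13t-cuda13_0'."""
    py_tag = "py" + desired_python.replace(".", "_")
    cuda_tag = "cuda" + gpu_arch_version.replace(".", "_")
    return f"{package_type}-{py_tag}-{cuda_tag}"

assert desired_cuda("13.0") == "cu130"
assert build_name("wheel", "3.13t", "13.0") == "wheel-py3_13t-cuda13_0"
```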
SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.13t" @@ -5635,7 +5588,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -5667,7 +5620,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -5688,7 +5640,7 @@ jobs: - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: wheel-py3_13t-cuda12_9 + name: wheel-py3_13t-cuda13_0 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Populate binary env shell: bash @@ -5711,22 +5663,22 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_13t-cuda12_9-upload: # Uploading + wheel-py3_13t-cuda13_0-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: wheel-py3_13t-cuda12_9-test + needs: wheel-py3_13t-cuda13_0-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: "12.9" + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda DESIRED_PYTHON: "3.13t" - build_name: wheel-py3_13t-cuda12_9 + build_name: wheel-py3_13t-cuda13_0 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml @@ -5734,7 +5686,7 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -5744,7 +5696,7 @@ jobs: GPU_ARCH_TYPE: xpu SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.13t" - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.1.1 | intel-cmplr-lib-ur==2025.1.1 | intel-cmplr-lic-rt==2025.1.1 | intel-sycl-rt==2025.1.1 | oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.1.0 | onemkl-sycl-dft==2025.1.0 | onemkl-sycl-lapack==2025.1.0 | onemkl-sycl-rng==2025.1.0 | onemkl-sycl-sparse==2025.1.0 | dpcpp-cpp-rt==2025.1.1 | intel-opencl-rt==2025.1.1 | mkl==2025.1.0 | intel-openmp==2025.1.1 | tbb==2022.1.0 | tcmlib==1.3.0 | umf==0.10.0 | intel-pti==0.12.3 + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.2.1 | intel-cmplr-lib-ur==2025.2.1 | intel-cmplr-lic-rt==2025.2.1 | intel-sycl-rt==2025.2.1 | oneccl-devel==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.2.0 | onemkl-sycl-dft==2025.2.0 | onemkl-sycl-lapack==2025.2.0 | onemkl-sycl-rng==2025.2.0 | onemkl-sycl-sparse==2025.2.0 | dpcpp-cpp-rt==2025.2.1 | intel-opencl-rt==2025.2.1 | mkl==2025.2.0 | intel-openmp==2025.2.1 | tbb==2022.2.0 | tcmlib==1.4.0 | umf==0.11.0 | intel-pti==0.13.1 
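The bumped PYTORCH_EXTRA_INSTALL_REQUIREMENTS value above is a single string of pip requirement specifiers joined with "|", and the oneCCL/IMPI pins carry PEP 508 environment markers (; platform_system == 'Linux' and platform_machine == 'x86_64') so they only apply on Linux x86_64. A minimal Python sketch, assuming the string is simply split on that separator, shows how such markers evaluate with the standard packaging library; how the PyTorch binary-build scripts actually consume this variable is not reproduced here.

```python
# Minimal sketch: split a PYTORCH_EXTRA_INSTALL_REQUIREMENTS-style string into
# pip requirement specifiers and evaluate their PEP 508 markers.
# Assumption: the "|" separator seen in this workflow is the only delimiter;
# the real consumer of this variable in the build scripts may behave differently.
from packaging.requirements import Requirement

extra = (
    "intel-cmplr-lib-rt==2025.2.1 | "
    "oneccl==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | "
    "mkl==2025.2.0"
)

for spec in (s.strip() for s in extra.split("|")):
    req = Requirement(spec)
    applies = req.marker.evaluate() if req.marker else True
    print(f"{req.name}{req.specifier}  ->  install on this platform: {applies}")
```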
steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -5770,7 +5722,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -5802,7 +5754,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -5846,7 +5797,7 @@ jobs: - wheel-py3_13t-xpu-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -5872,7 +5823,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -5904,7 +5855,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -5970,7 +5920,7 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -6005,7 +5955,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -6037,7 +5987,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -6081,7 +6030,7 @@ jobs: - wheel-py3_14-cpu-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -6107,7 +6056,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -6139,7 +6088,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -6205,7 +6153,7 @@ jobs: if: ${{ github.repository_owner == 
'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -6241,7 +6189,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -6273,7 +6221,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -6317,7 +6264,7 @@ jobs: - wheel-py3_14-cuda12_6-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -6344,7 +6291,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -6376,7 +6323,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -6443,7 +6389,7 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -6479,7 +6425,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -6511,7 +6457,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -6555,7 +6500,7 @@ jobs: - wheel-py3_14-cuda12_8-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -6582,7 +6527,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -6614,7 +6559,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -6677,18 +6621,18 @@ 
jobs: secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_14-cuda12_9-build: + wheel-py3_14-cuda13_0-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: "12.9" + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.14" @@ -6717,7 +6661,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -6749,7 +6693,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -6769,7 +6712,7 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: - name: wheel-py3_14-cuda12_9 + name: wheel-py3_14-cuda13_0 retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -6787,20 +6730,20 @@ jobs: run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_14-cuda12_9-test: # Testing + wheel-py3_14-cuda13_0-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - wheel-py3_14-cuda12_9-build + - wheel-py3_14-cuda13_0-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: "12.9" + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.14" @@ -6820,7 +6763,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -6852,7 +6795,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -6873,7 +6815,7 @@ jobs: - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: wheel-py3_14-cuda12_9 + name: wheel-py3_14-cuda13_0 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Populate binary env shell: bash @@ -6896,22 +6838,22 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_14-cuda12_9-upload: # Uploading + wheel-py3_14-cuda13_0-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: wheel-py3_14-cuda12_9-test + needs: wheel-py3_14-cuda13_0-test with: PYTORCH_ROOT: ${{ github.workspace 
}}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: "12.9" + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda DESIRED_PYTHON: "3.14" - build_name: wheel-py3_14-cuda12_9 + build_name: wheel-py3_14-cuda13_0 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml @@ -6919,7 +6861,7 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -6929,7 +6871,7 @@ jobs: GPU_ARCH_TYPE: xpu SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.14" - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.1.1 | intel-cmplr-lib-ur==2025.1.1 | intel-cmplr-lic-rt==2025.1.1 | intel-sycl-rt==2025.1.1 | oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.1.0 | onemkl-sycl-dft==2025.1.0 | onemkl-sycl-lapack==2025.1.0 | onemkl-sycl-rng==2025.1.0 | onemkl-sycl-sparse==2025.1.0 | dpcpp-cpp-rt==2025.1.1 | intel-opencl-rt==2025.1.1 | mkl==2025.1.0 | intel-openmp==2025.1.1 | tbb==2022.1.0 | tcmlib==1.3.0 | umf==0.10.0 | intel-pti==0.12.3 + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.2.1 | intel-cmplr-lib-ur==2025.2.1 | intel-cmplr-lic-rt==2025.2.1 | intel-sycl-rt==2025.2.1 | oneccl-devel==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.2.0 | onemkl-sycl-dft==2025.2.0 | onemkl-sycl-lapack==2025.2.0 | onemkl-sycl-rng==2025.2.0 | onemkl-sycl-sparse==2025.2.0 | dpcpp-cpp-rt==2025.2.1 | intel-opencl-rt==2025.2.1 | mkl==2025.2.0 | intel-openmp==2025.2.1 | tbb==2022.2.0 | tcmlib==1.4.0 | umf==0.11.0 | intel-pti==0.13.1 steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -6955,7 +6897,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -6987,7 +6929,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -7031,7 +6972,7 @@ jobs: - wheel-py3_14-xpu-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -7057,7 +6998,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: 
pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -7089,7 +7030,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -7155,7 +7095,7 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -7190,7 +7130,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -7222,7 +7162,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -7266,7 +7205,7 @@ jobs: - wheel-py3_14t-cpu-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -7292,7 +7231,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -7324,7 +7263,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -7390,7 +7328,7 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -7426,7 +7364,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -7458,7 +7396,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -7502,7 +7439,7 @@ jobs: - wheel-py3_14t-cuda12_6-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -7529,7 +7466,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH 
(Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -7561,7 +7498,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -7628,7 +7564,7 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -7664,7 +7600,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -7696,7 +7632,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -7740,7 +7675,7 @@ jobs: - wheel-py3_14t-cuda12_8-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -7767,7 +7702,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -7799,7 +7734,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -7862,18 +7796,18 @@ jobs: secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_14t-cuda12_9-build: + wheel-py3_14t-cuda13_0-build: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: "12.9" + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.14t" @@ -7902,7 +7836,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -7934,7 +7868,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} 
submodules: recursive path: pytorch show-progress: false @@ -7954,7 +7887,7 @@ jobs: - uses: actions/upload-artifact@v4.4.0 if: always() with: - name: wheel-py3_14t-cuda12_9 + name: wheel-py3_14t-cuda13_0 retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -7972,20 +7905,20 @@ jobs: run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_14t-cuda12_9-test: # Testing + wheel-py3_14t-cuda13_0-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} needs: - - wheel-py3_14t-cuda12_9-build + - wheel-py3_14t-cuda13_0-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: "12.9" + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.14t" @@ -8005,7 +7938,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -8037,7 +7970,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -8058,7 +7990,7 @@ jobs: - uses: actions/download-artifact@v4.1.7 name: Download Build Artifacts with: - name: wheel-py3_14t-cuda12_9 + name: wheel-py3_14t-cuda13_0 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Populate binary env shell: bash @@ -8081,22 +8013,22 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_14t-cuda12_9-upload: # Uploading + wheel-py3_14t-cuda13_0-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} permissions: id-token: write contents: read - needs: wheel-py3_14t-cuda12_9-test + needs: wheel-py3_14t-cuda13_0-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu129 - GPU_ARCH_VERSION: "12.9" + DESIRED_CUDA: cu130 + GPU_ARCH_VERSION: "13.0" GPU_ARCH_TYPE: cuda DESIRED_PYTHON: "3.14t" - build_name: wheel-py3_14t-cuda12_9 + build_name: wheel-py3_14t-cuda13_0 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} uses: ./.github/workflows/_binary-upload.yml @@ -8104,7 +8036,7 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -8114,7 +8046,7 @@ jobs: GPU_ARCH_TYPE: xpu SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.14t" - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.1.1 | intel-cmplr-lib-ur==2025.1.1 | intel-cmplr-lic-rt==2025.1.1 | intel-sycl-rt==2025.1.1 | oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | 
onemkl-sycl-blas==2025.1.0 | onemkl-sycl-dft==2025.1.0 | onemkl-sycl-lapack==2025.1.0 | onemkl-sycl-rng==2025.1.0 | onemkl-sycl-sparse==2025.1.0 | dpcpp-cpp-rt==2025.1.1 | intel-opencl-rt==2025.1.1 | mkl==2025.1.0 | intel-openmp==2025.1.1 | tbb==2022.1.0 | tcmlib==1.3.0 | umf==0.10.0 | intel-pti==0.12.3 + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: intel-cmplr-lib-rt==2025.2.1 | intel-cmplr-lib-ur==2025.2.1 | intel-cmplr-lic-rt==2025.2.1 | intel-sycl-rt==2025.2.1 | oneccl-devel==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | oneccl==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | impi-rt==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | onemkl-sycl-blas==2025.2.0 | onemkl-sycl-dft==2025.2.0 | onemkl-sycl-lapack==2025.2.0 | onemkl-sycl-rng==2025.2.0 | onemkl-sycl-sparse==2025.2.0 | dpcpp-cpp-rt==2025.2.1 | intel-opencl-rt==2025.2.1 | mkl==2025.2.0 | intel-openmp==2025.2.1 | tbb==2022.2.0 | tcmlib==1.4.0 | umf==0.11.0 | intel-pti==0.13.1 steps: # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -8140,7 +8072,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -8172,7 +8104,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false @@ -8216,7 +8147,7 @@ jobs: - wheel-py3_14t-xpu-build - get-label-type runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" - timeout-minutes: 300 + timeout-minutes: 360 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch PACKAGE_TYPE: wheel @@ -8242,7 +8173,7 @@ jobs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: pytorch/test-infra/.github/actions/setup-ssh@main + uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.9 continue-on-error: true with: github-secret: ${{ secrets.GITHUB_TOKEN }} @@ -8274,7 +8205,6 @@ jobs: - name: Checkout PyTorch uses: actions/checkout@v4 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch show-progress: false diff --git a/.github/workflows/h100-cutlass-backend.yml b/.github/workflows/h100-cutlass-backend.yml index edf4c2e0e807..6eb072399242 100644 --- a/.github/workflows/h100-cutlass-backend.yml +++ b/.github/workflows/h100-cutlass-backend.yml @@ -27,7 +27,7 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.9 with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} diff --git a/.github/workflows/h100-distributed.yml b/.github/workflows/h100-distributed.yml index a0a7495483d4..8996add88383 100644 --- a/.github/workflows/h100-distributed.yml +++ 
b/.github/workflows/h100-distributed.yml @@ -24,7 +24,7 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.9 with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} diff --git a/.github/workflows/h100-symm-mem.yml b/.github/workflows/h100-symm-mem.yml index c75ca569fc7d..fa8a795216f3 100644 --- a/.github/workflows/h100-symm-mem.yml +++ b/.github/workflows/h100-symm-mem.yml @@ -24,7 +24,7 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.9 with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} diff --git a/.github/workflows/inductor-micro-benchmark-x86.yml b/.github/workflows/inductor-micro-benchmark-x86.yml index 117183428abc..c6cc075e6b27 100644 --- a/.github/workflows/inductor-micro-benchmark-x86.yml +++ b/.github/workflows/inductor-micro-benchmark-x86.yml @@ -18,13 +18,13 @@ permissions: contents: read jobs: - linux-jammy-cpu-py3_9-gcc11-inductor-build: + inductor-build: if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} - name: linux-jammy-cpu-py3.9-gcc11-inductor + name: inductor-build uses: ./.github/workflows/_linux-build.yml with: build-environment: linux-jammy-py3.9-gcc11 - docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks + docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks # Use metal host for benchmark jobs test-matrix: | { include: [ @@ -32,13 +32,13 @@ jobs: ]} secrets: inherit - linux-jammy-cpu-py3_9-gcc11-inductor-micro-benchmark-test: - name: linux-jammy-cpu-py3.9-gcc11-inductor + inductor-micro-benchmark-test: + name: inductor-micro-benchmark-test uses: ./.github/workflows/_linux-test.yml - needs: linux-jammy-cpu-py3_9-gcc11-inductor-build + needs: inductor-build with: build-environment: linux-jammy-py3.9-gcc11 - docker-image: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.test-matrix }} + docker-image: ${{ needs.inductor-build.outputs.docker-image }} + test-matrix: ${{ needs.inductor-build.outputs.test-matrix }} timeout-minutes: 720 secrets: inherit diff --git a/.github/workflows/inductor-micro-benchmark.yml b/.github/workflows/inductor-micro-benchmark.yml index a0ae234ab566..842094e0eb48 100644 --- a/.github/workflows/inductor-micro-benchmark.yml +++ b/.github/workflows/inductor-micro-benchmark.yml @@ -20,7 +20,7 @@ permissions: jobs: get-default-label-prefix: name: get-default-label-prefix - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.9 if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} diff --git a/.github/workflows/inductor-nightly.yml b/.github/workflows/inductor-nightly.yml index c17a4ed6341a..7502381de93d 100644 --- a/.github/workflows/inductor-nightly.yml +++ 
b/.github/workflows/inductor-nightly.yml @@ -23,7 +23,7 @@ permissions: jobs: get-default-label-prefix: name: get-default-label-prefix - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.9 if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} @@ -32,13 +32,13 @@ jobs: curr_ref_type: ${{ github.ref_type }} opt_out_experiments: lf - linux-jammy-cpu-py3_9-gcc11-nightly-dynamo-benchmarks-build: - name: linux-jammy-cpu-py3.9-gcc11-nightly-dynamo-benchmarks + nightly-dynamo-benchmarks-build: + name: nightly-dynamo-benchmarks-build uses: ./.github/workflows/_linux-build.yml needs: get-default-label-prefix with: - build-environment: linux-jammy-py3.9-gcc11-build - docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks + build-environment: linux-jammy-py3.10-gcc11-build + docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}" test-matrix: | { include: [ @@ -51,13 +51,13 @@ jobs: build-additional-packages: "vision audio torchao" secrets: inherit - linux-jammy-cpu-py3_9-gcc11-nightly-dynamo-benchmarks-test: - name: linux-jammy-cpu-py3.9-gcc11-nightly-dynamo-benchmarks + nightly-dynamo-benchmarks-test: + name: nightly-dynamo-benchmarks-test uses: ./.github/workflows/_linux-test.yml - needs: linux-jammy-cpu-py3_9-gcc11-nightly-dynamo-benchmarks-build + needs: nightly-dynamo-benchmarks-build with: - build-environment: linux-jammy-py3.9-gcc11-build - docker-image: ${{ needs.linux-jammy-cpu-py3_9-gcc11-nightly-dynamo-benchmarks-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-jammy-cpu-py3_9-gcc11-nightly-dynamo-benchmarks-build.outputs.test-matrix }} + build-environment: linux-jammy-py3.10-gcc11-build + docker-image: ${{ needs.nightly-dynamo-benchmarks-build.outputs.docker-image }} + test-matrix: ${{ needs.nightly-dynamo-benchmarks-build.outputs.test-matrix }} timeout-minutes: 720 secrets: inherit diff --git a/.github/workflows/inductor-perf-compare.yml b/.github/workflows/inductor-perf-compare.yml index 628f62424012..35217f72bf1a 100644 --- a/.github/workflows/inductor-perf-compare.yml +++ b/.github/workflows/inductor-perf-compare.yml @@ -18,7 +18,7 @@ jobs: get-default-label-prefix: if: github.repository_owner == 'pytorch' name: get-default-label-prefix - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.9 with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} diff --git a/.github/workflows/inductor-perf-test-b200.yml b/.github/workflows/inductor-perf-test-b200.yml index 7b59e92386a3..3c648a849f78 100644 --- a/.github/workflows/inductor-perf-test-b200.yml +++ b/.github/workflows/inductor-perf-test-b200.yml @@ -70,7 +70,7 @@ permissions: jobs: get-label-type: name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.9 if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} diff --git 
a/.github/workflows/inductor-perf-test-nightly-aarch64.yml b/.github/workflows/inductor-perf-test-nightly-aarch64.yml index e16c8be79130..9e3165fe11ea 100644 --- a/.github/workflows/inductor-perf-test-nightly-aarch64.yml +++ b/.github/workflows/inductor-perf-test-nightly-aarch64.yml @@ -55,7 +55,7 @@ permissions: jobs: get-label-type: name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.9 if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} diff --git a/.github/workflows/inductor-perf-test-nightly-h100.yml b/.github/workflows/inductor-perf-test-nightly-h100.yml index dfaec8240d6c..7e323fa5a92e 100644 --- a/.github/workflows/inductor-perf-test-nightly-h100.yml +++ b/.github/workflows/inductor-perf-test-nightly-h100.yml @@ -75,7 +75,7 @@ permissions: jobs: get-label-type: name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.9 if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} @@ -84,9 +84,8 @@ jobs: curr_ref_type: ${{ github.ref_type }} opt_out_experiments: lf - # NB: Keep this in sync with trunk.yml build: - name: cuda12.8-py3.10-gcc9-sm90 + name: build uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: @@ -128,7 +127,7 @@ jobs: secrets: inherit test-periodically: - name: cuda12.8-py3.10-gcc9-sm90 + name: test-periodically uses: ./.github/workflows/_linux-test.yml needs: build if: github.event.schedule == '15 0,12 * * 1-6' @@ -145,7 +144,7 @@ jobs: secrets: inherit test-weekly: - name: cuda12.8-py3.10-gcc9-sm90 + name: test-weekly uses: ./.github/workflows/_linux-test.yml needs: build if: github.event.schedule == '0 7 * * 0' @@ -162,9 +161,12 @@ jobs: secrets: inherit test: - name: cuda12.8-py3.10-gcc9-sm90 + name: test uses: ./.github/workflows/_linux-test.yml needs: build + # The pull_request trigger is used in PR to bump transformers pin which always + # needs one round of benchmark + if: ${{ github.event_name == 'workflow_dispatch' || github.event_name == 'pull_request' }} with: build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm90 dashboard-tag: training-${{ inputs.training || 'true' }}-inference-${{ inputs.inference || 'true' }}-default-${{ inputs.default || 'true' }}-dynamic-${{ inputs.dynamic || 'true' }}-cudagraphs-${{ inputs.cudagraphs || 'true' }}-cppwrapper-${{ inputs.cppwrapper || 'false' }}-aotinductor-${{ inputs.aotinductor || 'false' }}-maxautotune-${{ inputs.maxautotune || 'false' }}-freezing_cudagraphs-${{ inputs.freezing_cudagraphs || 'false' }}-cudagraphs_low_precision-${{ inputs.cudagraphs || 'false' }} diff --git a/.github/workflows/inductor-perf-test-nightly-macos.yml b/.github/workflows/inductor-perf-test-nightly-macos.yml index 0d92455a8f3c..c3b9a4229924 100644 --- a/.github/workflows/inductor-perf-test-nightly-macos.yml +++ b/.github/workflows/inductor-perf-test-nightly-macos.yml @@ -48,6 +48,9 @@ jobs: { config: "perf_smoketest", shard: 1, num_shards: 3, runner: "macos-m2-15" }, { config: "perf_smoketest", shard: 2, num_shards: 3, runner: "macos-m2-15" }, { config: "perf_smoketest", shard: 3, num_shards: 3, runner: "macos-m2-15" }, + { config: 
"aot_inductor_perf_smoketest", shard: 1, num_shards: 3, runner: "macos-m2-15" }, + { config: "aot_inductor_perf_smoketest", shard: 2, num_shards: 3, runner: "macos-m2-15" }, + { config: "aot_inductor_perf_smoketest", shard: 3, num_shards: 3, runner: "macos-m2-15" }, ]} secrets: inherit diff --git a/.github/workflows/inductor-perf-test-nightly-rocm.yml b/.github/workflows/inductor-perf-test-nightly-rocm.yml index f329fe74e6b6..dddf68091fdb 100644 --- a/.github/workflows/inductor-perf-test-nightly-rocm.yml +++ b/.github/workflows/inductor-perf-test-nightly-rocm.yml @@ -70,7 +70,7 @@ permissions: read-all jobs: get-label-type: name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.9 if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} diff --git a/.github/workflows/inductor-perf-test-nightly-x86-zen.yml b/.github/workflows/inductor-perf-test-nightly-x86-zen.yml index 6e19130a1924..8057b1042676 100644 --- a/.github/workflows/inductor-perf-test-nightly-x86-zen.yml +++ b/.github/workflows/inductor-perf-test-nightly-x86-zen.yml @@ -60,7 +60,7 @@ permissions: jobs: get-label-type: name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.9 if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} @@ -69,14 +69,14 @@ jobs: curr_ref_type: ${{ github.ref_type }} opt_out_experiments: lf - linux-jammy-zen-cpu-py3_9-gcc11-inductor-build: - name: linux-jammy-zen-cpu-py3.9-gcc11-inductor + inductor-build: + name: inductor-build uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build-environment: linux-jammy-py3.9-gcc11-build - docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks + build-environment: linux-jammy-py3.10-gcc11-build + docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks test-matrix: | { include: [ { config: "inductor_huggingface_perf_cpu_x86_zen", shard: 1, num_shards: 3, runner: "linux.24xlarge.amd" }, @@ -95,16 +95,16 @@ jobs: selected-test-configs: ${{ inputs.benchmark_configs }} secrets: inherit - linux-jammy-zen-cpu-py3_9-gcc11-inductor-test-nightly: - name: linux-jammy-zen-cpu-py3.9-gcc11-inductor + inductor-test-nightly: + name: inductor-test-nightly uses: ./.github/workflows/_linux-test.yml - needs: linux-jammy-zen-cpu-py3_9-gcc11-inductor-build + needs: inductor-build if: github.event.schedule == '0 7 * * *' with: - build-environment: linux-jammy-py3.9-gcc11-build + build-environment: linux-jammy-py3.10-gcc11-build dashboard-tag: training-false-inference-true-default-true-dynamic-true-cppwrapper-true-aotinductor-true - docker-image: ${{ needs.linux-jammy-zen-cpu-py3_9-gcc11-inductor-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-jammy-zen-cpu-py3_9-gcc11-inductor-build.outputs.test-matrix }} + docker-image: ${{ needs.inductor-build.outputs.docker-image }} + test-matrix: ${{ needs.inductor-build.outputs.test-matrix }} timeout-minutes: 720 # disable monitor in perf tests disable-monitor: false @@ -112,17 +112,16 @@ jobs: monitor-data-collect-interval: 4 secrets: 
inherit - - linux-jammy-zen-cpu-py3_9-gcc11-inductor-test: - name: linux-jammy-zen-cpu-py3.9-gcc11-inductor + inductor-test: + name: inductor-test uses: ./.github/workflows/_linux-test.yml - needs: linux-jammy-zen-cpu-py3_9-gcc11-inductor-build + needs: inductor-build if: github.event_name == 'workflow_dispatch' with: - build-environment: linux-jammy-py3.9-gcc11-build + build-environment: linux-jammy-py3.10-gcc11-build dashboard-tag: training-${{ inputs.training }}-inference-${{ inputs.inference }}-default-${{ inputs.default }}-dynamic-${{ inputs.dynamic }}-cppwrapper-${{ inputs.cppwrapper }}-aotinductor-${{ inputs.aotinductor }} - docker-image: ${{ needs.linux-jammy-zen-cpu-py3_9-gcc11-inductor-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-jammy-zen-cpu-py3_9-gcc11-inductor-build.outputs.test-matrix }} + docker-image: ${{ needs.inductor-build.outputs.docker-image }} + test-matrix: ${{ needs.inductor-build.outputs.test-matrix }} timeout-minutes: 720 # disable monitor in perf tests disable-monitor: false diff --git a/.github/workflows/inductor-perf-test-nightly-x86.yml b/.github/workflows/inductor-perf-test-nightly-x86.yml index 62234e5f499a..b68e9ad95ca4 100644 --- a/.github/workflows/inductor-perf-test-nightly-x86.yml +++ b/.github/workflows/inductor-perf-test-nightly-x86.yml @@ -65,7 +65,7 @@ permissions: jobs: get-label-type: name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.9 if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} @@ -74,14 +74,14 @@ jobs: curr_ref_type: ${{ github.ref_type }} opt_out_experiments: lf - linux-jammy-cpu-py3_9-gcc11-inductor-build: - name: linux-jammy-cpu-py3.9-gcc11-inductor + inductor-build: + name: inductor-build uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build-environment: linux-jammy-py3.9-gcc11-build - docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks + build-environment: linux-jammy-py3.10-gcc11-build + docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks test-matrix: | { include: [ { config: "inductor_huggingface_perf_cpu_x86", shard: 1, num_shards: 3, runner: "linux.24xl.spr-metal" }, @@ -101,16 +101,16 @@ jobs: build-additional-packages: "vision audio torchao" secrets: inherit - linux-jammy-cpu-py3_9-gcc11-inductor-test-nightly-freezing: - name: linux-jammy-cpu-py3.9-gcc11-inductor + inductor-test-nightly-freezing: + name: inductor-test-nightly-freezing uses: ./.github/workflows/_linux-test.yml - needs: linux-jammy-cpu-py3_9-gcc11-inductor-build + needs: inductor-build if: github.event.schedule == '0 7 * * *' with: - build-environment: linux-jammy-py3.9-gcc11-build + build-environment: linux-jammy-py3.10-gcc11-build dashboard-tag: training-false-inference-true-default-true-dynamic-true-cppwrapper-true-aotinductor-true-freezing-true - docker-image: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.test-matrix }} + docker-image: ${{ needs.inductor-build.outputs.docker-image }} + test-matrix: ${{ needs.inductor-build.outputs.test-matrix }} timeout-minutes: 720 # disable monitor in perf tests disable-monitor: false @@ -118,16 +118,16 @@ jobs: 
monitor-data-collect-interval: 4 secrets: inherit - linux-jammy-cpu-py3_9-gcc11-inductor-test: - name: linux-jammy-cpu-py3.9-gcc11-inductor + inductor-test: + name: inductor-test uses: ./.github/workflows/_linux-test.yml - needs: linux-jammy-cpu-py3_9-gcc11-inductor-build + needs: inductor-build if: github.event_name == 'workflow_dispatch' with: - build-environment: linux-jammy-py3.9-gcc11-build + build-environment: linux-jammy-py3.10-gcc11-build dashboard-tag: training-${{ inputs.training }}-inference-${{ inputs.inference }}-default-${{ inputs.default }}-dynamic-${{ inputs.dynamic }}-cppwrapper-${{ inputs.cppwrapper }}-aotinductor-${{ inputs.aotinductor }}-freezing-${{ inputs.freezing }} - docker-image: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.test-matrix }} + docker-image: ${{ needs.inductor-build.outputs.docker-image }} + test-matrix: ${{ needs.inductor-build.outputs.test-matrix }} timeout-minutes: 720 # disable monitor in perf tests disable-monitor: false diff --git a/.github/workflows/inductor-perf-test-nightly.yml b/.github/workflows/inductor-perf-test-nightly.yml index 9fd81a5a05c9..7c573d4d2571 100644 --- a/.github/workflows/inductor-perf-test-nightly.yml +++ b/.github/workflows/inductor-perf-test-nightly.yml @@ -70,7 +70,7 @@ permissions: jobs: get-label-type: name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.9 if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} @@ -79,7 +79,6 @@ jobs: curr_ref_type: ${{ github.ref_type }} opt_out_experiments: lf - # NB: Keep this in sync with trunk.yml build: name: cuda12.8-py3.10-gcc9-sm80 uses: ./.github/workflows/_linux-build.yml diff --git a/.github/workflows/inductor-periodic.yml b/.github/workflows/inductor-periodic.yml index 436cf95c156d..b17ebb84d5d3 100644 --- a/.github/workflows/inductor-periodic.yml +++ b/.github/workflows/inductor-periodic.yml @@ -22,7 +22,7 @@ permissions: jobs: get-default-label-prefix: name: get-default-label-prefix - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.9 if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} @@ -31,8 +31,8 @@ jobs: curr_ref_type: ${{ github.ref_type }} opt_out_experiments: lf - linux-jammy-cuda12_8-py3_10-gcc9-periodic-dynamo-benchmarks-build: - name: cuda12.8-py3.10-gcc9-sm86-periodic-dynamo-benchmarks + periodic-dynamo-benchmarks-build: + name: periodic-dynamo-benchmarks-build uses: ./.github/workflows/_linux-build.yml needs: get-default-label-prefix with: @@ -57,23 +57,33 @@ jobs: { config: "dynamic_aot_eager_huggingface", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" }, { config: "dynamic_aot_eager_timm", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, { config: "dynamic_aot_eager_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, + { config: "dynamic_inductor_huggingface", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" }, + { config: "dynamic_inductor_timm", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, + { 
config: "dynamic_inductor_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, + { config: "dynamic_inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, + { config: "dynamic_inductor_torchbench", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, + { config: "aot_inductor_huggingface", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" }, + { config: "aot_inductor_timm", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, + { config: "aot_inductor_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, + { config: "aot_inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, + { config: "aot_inductor_torchbench", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, ]} build-additional-packages: "vision audio fbgemm torchao" secrets: inherit - linux-jammy-cuda12_8-py3_10-gcc9-periodic-dynamo-benchmarks-test: - name: cuda12.8-py3.10-gcc9-sm86-periodic-dynamo-benchmarks + periodic-dynamo-benchmarks-test: + name: periodic-dynamo-benchmarks-test uses: ./.github/workflows/_linux-test.yml - needs: linux-jammy-cuda12_8-py3_10-gcc9-periodic-dynamo-benchmarks-build + needs: periodic-dynamo-benchmarks-build with: build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm86 - docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-periodic-dynamo-benchmarks-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-periodic-dynamo-benchmarks-build.outputs.test-matrix }} + docker-image: ${{ needs.periodic-dynamo-benchmarks-build.outputs.docker-image }} + test-matrix: ${{ needs.periodic-dynamo-benchmarks-build.outputs.test-matrix }} secrets: inherit - linux-jammy-rocm-py3_10-periodic-dynamo-benchmarks-build: + rocm-periodic-dynamo-benchmarks-build: if: github.repository_owner == 'pytorch' - name: rocm-py3_10-periodic-dynamo-benchmarks + name: rocm-periodic-dynamo-benchmarks-build uses: ./.github/workflows/_linux-build.yml with: build-environment: linux-jammy-rocm-py3_10 @@ -99,21 +109,21 @@ jobs: ]} secrets: inherit - linux-jammy-rocm-py3_10-periodic-dynamo-benchmarks-test: + rocm-periodic-dynamo-benchmarks-test: permissions: id-token: write contents: read - name: rocm-py3_10-periodic-dynamo-benchmarks + name: rocm-periodic-dynamo-benchmarks-test uses: ./.github/workflows/_rocm-test.yml - needs: linux-jammy-rocm-py3_10-periodic-dynamo-benchmarks-build + needs: rocm-periodic-dynamo-benchmarks-build with: build-environment: linux-jammy-rocm-py3_10 - docker-image: ${{ needs.linux-jammy-rocm-py3_10-periodic-dynamo-benchmarks-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-jammy-rocm-py3_10-periodic-dynamo-benchmarks-build.outputs.test-matrix }} + docker-image: ${{ needs.rocm-periodic-dynamo-benchmarks-build.outputs.docker-image }} + test-matrix: ${{ needs.rocm-periodic-dynamo-benchmarks-build.outputs.test-matrix }} secrets: inherit - linux-jammy-cuda12_8-py3_10-gcc9-inductor-smoke-build: - name: cuda12.8-py3.10-gcc9-sm80 + inductor-smoke-build: + name: inductor-smoke-build uses: ./.github/workflows/_linux-build.yml needs: - get-default-label-prefix @@ -129,23 +139,23 @@ jobs: build-additional-packages: "vision audio fbgemm torchao" secrets: inherit - linux-jammy-cuda12_8-py3_10-gcc9-inductor-smoke-test: - name: cuda12.8-py3.10-gcc9-sm80 + inductor-smoke-test: + name: inductor-smoke-test uses: ./.github/workflows/_linux-test.yml - needs: linux-jammy-cuda12_8-py3_10-gcc9-inductor-smoke-build + needs: 
inductor-smoke-build with: build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80 - docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-smoke-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-smoke-build.outputs.test-matrix }} + docker-image: ${{ needs.inductor-smoke-build.outputs.docker-image }} + test-matrix: ${{ needs.inductor-smoke-build.outputs.test-matrix }} secrets: inherit - linux-jammy-cpu-py3_9-gcc11-periodic-dynamo-benchmarks-build: - name: linux-jammy-cpu-py3.9-gcc11-periodic-dynamo-benchmarks + periodic-dynamo-benchmarks-cpu-build: + name: periodic-dynamo-benchmarks-cpu-build uses: ./.github/workflows/_linux-build.yml needs: get-default-label-prefix with: - build-environment: linux-jammy-py3.9-gcc11-build - docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks + build-environment: linux-jammy-py3.10-gcc11-build + docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}" test-matrix: | { include: [ @@ -160,68 +170,6 @@ jobs: { config: "cpu_inductor_freezing_avx2_torchbench", shard: 2, num_shards: 2, runner: "linux.10xlarge.avx2" }, { config: "cpu_inductor_freezing_avx2_timm", shard: 1, num_shards: 2, runner: "linux.10xlarge.avx2" }, { config: "cpu_inductor_freezing_avx2_timm", shard: 2, num_shards: 2, runner: "linux.10xlarge.avx2" }, - ]} - build-additional-packages: "vision audio torchao" - secrets: inherit - - linux-jammy-cpu-py3_9-gcc11-periodic-dynamo-benchmarks-test: - name: linux-jammy-cpu-py3.9-gcc11-periodic-dynamo-benchmarks - uses: ./.github/workflows/_linux-test.yml - needs: linux-jammy-cpu-py3_9-gcc11-periodic-dynamo-benchmarks-build - with: - build-environment: linux-jammy-py3.9-gcc11-build - docker-image: ${{ needs.linux-jammy-cpu-py3_9-gcc11-periodic-dynamo-benchmarks-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-jammy-cpu-py3_9-gcc11-periodic-dynamo-benchmarks-build.outputs.test-matrix }} - secrets: inherit - - - linux-jammy-cuda12_8-py3_10-gcc9-inductor-build: - name: cuda12.8-py3.10-gcc9-sm86 - uses: ./.github/workflows/_linux-build.yml - needs: get-default-label-prefix - with: - build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm86 - docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks - cuda-arch-list: '8.6' - runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}" - sync-tag: linux-jammy-cuda12_8-py3_10-gcc9-inductor-build - test-matrix: | - { include: [ - { config: "dynamic_inductor_huggingface", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" }, - { config: "dynamic_inductor_timm", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, - { config: "dynamic_inductor_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, - { config: "dynamic_inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, - { config: "dynamic_inductor_torchbench", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, - { config: "aot_inductor_huggingface", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" }, - { config: "aot_inductor_timm", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, - { config: "aot_inductor_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, - { config: "aot_inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, - { config: 
"aot_inductor_torchbench", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, - ]} - build-additional-packages: "vision audio fbgemm torchao" - secrets: inherit - - linux-jammy-cuda12_8-py3_10-gcc9-inductor-test: - name: cuda12.8-py3.10-gcc9-sm86 - uses: ./.github/workflows/_linux-test.yml - needs: linux-jammy-cuda12_8-py3_10-gcc9-inductor-build - with: - build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm86 - docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.test-matrix }} - secrets: inherit - - linux-jammy-cpu-py3_9-gcc11-inductor-build: - name: linux-jammy-cpu-py3.9-gcc11-inductor - uses: ./.github/workflows/_linux-build.yml - needs: get-default-label-prefix - with: - build-environment: linux-jammy-py3.9-gcc11-build - docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks - runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}" - sync-tag: linux-jammy-cpu-py3_9-gcc11-inductor-build - test-matrix: | - { include: [ { config: "cpu_inductor_freezing_huggingface", shard: 1, num_shards: 1, runner: "linux.8xlarge.amx" }, { config: "cpu_inductor_freezing_timm", shard: 1, num_shards: 2, runner: "linux.8xlarge.amx" }, { config: "cpu_inductor_freezing_timm", shard: 2, num_shards: 2, runner: "linux.8xlarge.amx" }, @@ -247,12 +195,12 @@ jobs: build-additional-packages: "vision audio torchao" secrets: inherit - linux-jammy-cpu-py3_9-gcc11-inductor-test: - name: linux-jammy-cpu-py3.9-gcc11-inductor + periodic-dynamo-benchmarks-cpu-test: + name: periodic-dynamo-benchmarks-cpu-test uses: ./.github/workflows/_linux-test.yml - needs: linux-jammy-cpu-py3_9-gcc11-inductor-build + needs: periodic-dynamo-benchmarks-cpu-build with: - build-environment: linux-jammy-py3.9-gcc11-build - docker-image: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.test-matrix }} + build-environment: linux-jammy-py3.10-gcc11-build + docker-image: ${{ needs.periodic-dynamo-benchmarks-cpu-build.outputs.docker-image }} + test-matrix: ${{ needs.periodic-dynamo-benchmarks-cpu-build.outputs.test-matrix }} secrets: inherit diff --git a/.github/workflows/inductor-rocm-mi300.yml b/.github/workflows/inductor-rocm-mi300.yml index 732ec7eb85f3..369eee791dd6 100644 --- a/.github/workflows/inductor-rocm-mi300.yml +++ b/.github/workflows/inductor-rocm-mi300.yml @@ -28,7 +28,7 @@ jobs: get-label-type: name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.9 if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} diff --git a/.github/workflows/inductor-rocm.yml b/.github/workflows/inductor-rocm.yml index b1bb7972d67d..87d78b600f44 100644 --- a/.github/workflows/inductor-rocm.yml +++ b/.github/workflows/inductor-rocm.yml @@ -20,7 +20,7 @@ permissions: jobs: get-label-type: name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.9 if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} 
diff --git a/.github/workflows/inductor-unittest.yml b/.github/workflows/inductor-unittest.yml index df918c329dd7..31ca8e6faa3b 100644 --- a/.github/workflows/inductor-unittest.yml +++ b/.github/workflows/inductor-unittest.yml @@ -19,7 +19,7 @@ permissions: jobs: get-label-type: name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.9 if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} @@ -28,8 +28,8 @@ jobs: curr_ref_type: ${{ github.ref_type }} opt_out_experiments: lf - linux-jammy-cuda12_8-py3_10-gcc9-inductor-build: - name: cuda12.8-py3.10-gcc9-sm86 + inductor-build: + name: inductor-build uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: @@ -47,44 +47,18 @@ jobs: ]} secrets: inherit - linux-jammy-cuda12_8-py3_10-gcc9-inductor-test: - name: cuda12.8-py3.10-gcc9-sm86 + inductor-test: + name: inductor-test uses: ./.github/workflows/_linux-test.yml - needs: linux-jammy-cuda12_8-py3_10-gcc9-inductor-build + needs: inductor-build with: build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm86 - docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.test-matrix }} + docker-image: ${{ needs.inductor-build.outputs.docker-image }} + test-matrix: ${{ needs.inductor-build.outputs.test-matrix }} secrets: inherit - linux-jammy-cuda12_8-py3_12-gcc9-inductor-build: - name: cuda12.8-py3.12-gcc9-sm86 - uses: ./.github/workflows/_linux-build.yml - needs: get-label-type - with: - build-environment: linux-jammy-cuda12.8-py3.12-gcc9-sm86 - docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc9-inductor-benchmarks - cuda-arch-list: '8.6' - runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - test-matrix: | - { include: [ - { config: "inductor", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" }, - { config: "inductor", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" }, - ]} - secrets: inherit - - linux-jammy-cuda12_8-py3_12-gcc9-inductor-test: - name: cuda12.8-py3.12-gcc9-sm86 - uses: ./.github/workflows/_linux-test.yml - needs: linux-jammy-cuda12_8-py3_12-gcc9-inductor-build - with: - build-environment: linux-jammy-cuda12.8-py3.12-gcc9-sm86 - docker-image: ${{ needs.linux-jammy-cuda12_8-py3_12-gcc9-inductor-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_12-gcc9-inductor-build.outputs.test-matrix }} - secrets: inherit - - linux-jammy-cpu-py3_12-inductor-halide-build: - name: linux-jammy-cpu-py3.12-gcc11-inductor-halide + inductor-halide-build: + name: inductor-halide-build uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: @@ -97,18 +71,18 @@ jobs: ]} secrets: inherit - linux-jammy-cpu-py3_12-inductor-halide-test: - name: linux-jammy-cpu-py3.12-gcc11-inductor-halide + inductor-halide-test: + name: inductor-halide-test uses: ./.github/workflows/_linux-test.yml - needs: linux-jammy-cpu-py3_12-inductor-halide-build + needs: inductor-halide-build with: build-environment: linux-jammy-py3.12-gcc11 - docker-image: ${{ needs.linux-jammy-cpu-py3_12-inductor-halide-build.outputs.docker-image }} - test-matrix: ${{ 
needs.linux-jammy-cpu-py3_12-inductor-halide-build.outputs.test-matrix }} + docker-image: ${{ needs.inductor-halide-build.outputs.docker-image }} + test-matrix: ${{ needs.inductor-halide-build.outputs.test-matrix }} secrets: inherit - linux-jammy-cpu-py3_12-inductor-triton-cpu-build: - name: linux-jammy-cpu-py3.12-gcc11-inductor-triton-cpu + inductor-triton-cpu-build: + name: inductor-triton-cpu-build uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: @@ -121,23 +95,23 @@ jobs: ]} secrets: inherit - linux-jammy-cpu-py3_12-inductor-triton-cpu-test: + inductor-triton-cpu-test: name: linux-jammy-cpu-py3.12-gcc11-inductor-triton-cpu uses: ./.github/workflows/_linux-test.yml - needs: linux-jammy-cpu-py3_12-inductor-triton-cpu-build + needs: inductor-triton-cpu-build with: build-environment: linux-jammy-py3.12-gcc11 - docker-image: ${{ needs.linux-jammy-cpu-py3_12-inductor-triton-cpu-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-jammy-cpu-py3_12-inductor-triton-cpu-build.outputs.test-matrix }} + docker-image: ${{ needs.inductor-triton-cpu-build.outputs.docker-image }} + test-matrix: ${{ needs.inductor-triton-cpu-build.outputs.test-matrix }} secrets: inherit - linux-jammy-cpu-py3_9-gcc11-inductor-build: - name: linux-jammy-cpu-py3.9-gcc11-inductor + inductor-cpu-build: + name: inductor-cpu-build uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: - build-environment: linux-jammy-py3.9-gcc11-build - docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks + build-environment: linux-jammy-py3.10-gcc11-build + docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" test-matrix: | { include: [ @@ -148,37 +122,12 @@ jobs: ]} secrets: inherit - linux-jammy-cpu-py3_9-gcc11-inductor-test: - name: linux-jammy-cpu-py3.9-gcc11-inductor - uses: ./.github/workflows/_linux-test.yml - needs: linux-jammy-cpu-py3_9-gcc11-inductor-build - with: - build-environment: linux-jammy-py3.9-gcc11-build - docker-image: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.test-matrix }} - secrets: inherit - - linux-jammy-cuda12_8-py3_13-gcc9-inductor-build: - name: cuda12.8-py3.13-gcc9-sm86 - uses: ./.github/workflows/_linux-build.yml - needs: get-label-type - with: - build-environment: linux-jammy-cuda12.8-py3.13-gcc9-sm86 - docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3.13-gcc9-inductor-benchmarks - cuda-arch-list: '8.6' - test-matrix: | - { include: [ - { config: "inductor", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" }, - { config: "inductor", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" }, - ]} - secrets: inherit - - linux-jammy-cuda12_8-py3_13-gcc9-inductor-test: - name: cuda12.8-py3.13-gcc9-sm86 + inductor-cpu-test: + name: inductor-cpu-test uses: ./.github/workflows/_linux-test.yml - needs: linux-jammy-cuda12_8-py3_13-gcc9-inductor-build + needs: inductor-cpu-build with: - build-environment: linux-jammy-cuda12.8-py3.13-gcc9-sm86 - docker-image: ${{ needs.linux-jammy-cuda12_8-py3_13-gcc9-inductor-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_13-gcc9-inductor-build.outputs.test-matrix }} + build-environment: linux-jammy-py3.10-gcc11-build + docker-image: ${{ 
needs.inductor-cpu-build.outputs.docker-image }} + test-matrix: ${{ needs.inductor-cpu-build.outputs.test-matrix }} secrets: inherit diff --git a/.github/workflows/inductor.yml b/.github/workflows/inductor.yml index 721572f1807b..a70929dd868d 100644 --- a/.github/workflows/inductor.yml +++ b/.github/workflows/inductor.yml @@ -35,7 +35,7 @@ jobs: get-label-type: name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.9 if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} @@ -44,8 +44,8 @@ jobs: curr_ref_type: ${{ github.ref_type }} opt_out_experiments: lf - linux-jammy-cuda12_8-py3_10-gcc9-inductor-build: - name: cuda12.8-py3.10-gcc9-sm86 + inductor-build: + name: inductor-build uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: @@ -53,7 +53,6 @@ jobs: docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks cuda-arch-list: '8.6' runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - sync-tag: linux-jammy-cuda12_8-py3_10-gcc9-inductor-build test-matrix: | { include: [ { config: "inductor_huggingface", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" }, @@ -65,25 +64,24 @@ jobs: build-additional-packages: "vision audio fbgemm torchao" secrets: inherit - linux-jammy-cuda12_8-py3_10-gcc9-inductor-test: - name: cuda12.8-py3.10-gcc9-sm86 + inductor-test: + name: inductor-test uses: ./.github/workflows/_linux-test.yml - needs: linux-jammy-cuda12_8-py3_10-gcc9-inductor-build + needs: inductor-build with: build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm86 - docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.test-matrix }} + docker-image: ${{ needs.inductor-build.outputs.docker-image }} + test-matrix: ${{ needs.inductor-build.outputs.test-matrix }} secrets: inherit - linux-jammy-cpu-py3_9-gcc11-inductor-build: - name: linux-jammy-cpu-py3.9-gcc11-inductor + inductor-cpu-build: + name: inductor-cpu-build uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: - build-environment: linux-jammy-py3.9-gcc11-build - docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks + build-environment: linux-jammy-py3.10-gcc11-build + docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - sync-tag: linux-jammy-cpu-py3_9-gcc11-inductor-build test-matrix: | { include: [ { config: "cpu_inductor_torchbench", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.8xlarge.amx" }, @@ -98,12 +96,12 @@ jobs: build-additional-packages: "vision audio torchao" secrets: inherit - linux-jammy-cpu-py3_9-gcc11-inductor-test: - name: linux-jammy-cpu-py3.9-gcc11-inductor + inductor-cpu-test: + name: inductor-cpu-test uses: ./.github/workflows/_linux-test.yml - needs: linux-jammy-cpu-py3_9-gcc11-inductor-build + needs: inductor-cpu-build with: - build-environment: linux-jammy-py3.9-gcc11-build - docker-image: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.test-matrix }} + 
build-environment: linux-jammy-py3.10-gcc11-build + docker-image: ${{ needs.inductor-cpu-build.outputs.docker-image }} + test-matrix: ${{ needs.inductor-cpu-build.outputs.test-matrix }} secrets: inherit diff --git a/.github/workflows/lint-autoformat.yml b/.github/workflows/lint-autoformat.yml index b962970dc5b7..f64c9973d698 100644 --- a/.github/workflows/lint-autoformat.yml +++ b/.github/workflows/lint-autoformat.yml @@ -13,7 +13,7 @@ jobs: if: ${{ github.repository_owner == 'pytorch' && contains(github.event.pull_request.labels.*.name, 'autoformat') }} steps: - name: Checkout pytorch - uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.9 with: submodules: true fetch-depth: 0 diff --git a/.github/workflows/lint-bc.yml b/.github/workflows/lint-bc.yml index e0de9ede3508..98adf44aefd8 100644 --- a/.github/workflows/lint-bc.yml +++ b/.github/workflows/lint-bc.yml @@ -20,7 +20,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Run BC Lint Action - uses: pytorch/test-infra/.github/actions/bc-lint@main + uses: pytorch/test-infra/.github/actions/bc-lint@release/2.9 with: repo: ${{ github.event.pull_request.head.repo.full_name }} base_sha: ${{ github.event.pull_request.base.sha }} diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index b1a6dfb39071..534c15824715 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -21,7 +21,7 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.9 with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -33,7 +33,7 @@ jobs: uses: ./.github/workflows/_get-changed-files.yml lintrunner-clang: - uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@release/2.9 needs: [get-label-type, get-changed-files] # Only run if there are changed files relevant to clangtidy / clangformat if: | @@ -53,7 +53,7 @@ jobs: with: timeout: 120 runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" - docker-image: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-linter + docker-image: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3.10-linter # NB: A shallow checkout won't work here because calculate-docker-image requires a full checkout # to run git rev-parse HEAD~:.ci/docker when a new image is needed fetch-depth: 0 @@ -72,7 +72,7 @@ jobs: # NOTE: mypy needs its own job because it depends on --all-files, without assessing all files it sometimes # fails to find types when it should lintrunner-mypy: - uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@release/2.9 needs: [get-label-type, get-changed-files] # Only run if there are changed files relevant to mypy if: | @@ -96,7 +96,7 @@ jobs: ADDITIONAL_LINTRUNNER_ARGS="--take MYPY,MYPYSTRICT --all-files" .github/scripts/lintrunner.sh lintrunner-noclang: - uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@release/2.9 needs: [get-label-type, get-changed-files] with: timeout: 120 @@ -117,7 +117,7 @@ jobs: fi quick-checks: - uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + uses: 
pytorch/test-infra/.github/workflows/linux_job_v2.yml@release/2.9 needs: get-label-type with: timeout: 120 @@ -157,7 +157,7 @@ jobs: if: github.event_name == 'pull_request' && !contains(github.event.pull_request.labels.*.name, 'skip-pr-sanity-checks') steps: - name: Checkout PyTorch - uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.9 with: submodules: false fetch-depth: -1 @@ -170,7 +170,7 @@ jobs: bash .github/scripts/pr-sanity-check.sh workflow-checks: - uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@release/2.9 needs: get-label-type with: timeout: 120 @@ -181,6 +181,7 @@ jobs: ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} script: | # Regenerate workflows + export RELEASE_VERSION_TAG=2.9 .github/scripts/generate_ci_workflows.py RC=0 @@ -190,7 +191,7 @@ jobs: echo 'As shown by the above diff, the committed .github/workflows' echo 'are not up to date according to .github/templates.' echo 'Please run this command, commit, and push again to your PR:' - echo + echo export RELEASE_VERSION_TAG=2.9 echo ' .github/scripts/generate_ci_workflows.py' echo echo 'If running that command does nothing, you may need to rebase' @@ -204,7 +205,7 @@ jobs: exit $RC toc: - uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@release/2.9 needs: get-label-type with: timeout: 120 @@ -240,7 +241,7 @@ jobs: test-tools: name: Test tools if: ${{ github.repository == 'pytorch/pytorch' }} - uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@release/2.9 needs: get-label-type with: timeout: 120 @@ -260,14 +261,14 @@ jobs: runs-on: linux.24_04.4x steps: - name: Checkout PyTorch - uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.9 with: submodules: false fetch-depth: 1 - - name: Setup Python 3.9 + - name: Setup Python 3.10 uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 with: - python-version: '3.9' + python-version: '3.10' architecture: x64 cache: pip - name: Install dependencies @@ -297,7 +298,7 @@ jobs: # [see note: pytorch repo ref] # deep clone (fetch-depth 0) required, to allow us to use git log - name: Checkout PyTorch - uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.9 with: submodules: false fetch-depth: 1 diff --git a/.github/workflows/linux-aarch64.yml b/.github/workflows/linux-aarch64.yml index 2b840a39a5c2..357347f78138 100644 --- a/.github/workflows/linux-aarch64.yml +++ b/.github/workflows/linux-aarch64.yml @@ -19,7 +19,7 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.9 with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} diff --git a/.github/workflows/llm_td_retrieval.yml b/.github/workflows/llm_td_retrieval.yml index 565a9b25df50..292f0a956c35 100644 --- a/.github/workflows/llm_td_retrieval.yml +++ b/.github/workflows/llm_td_retrieval.yml @@ -12,7 +12,7 @@ jobs: name: get-label-type # Don't 
run on forked repos if: github.repository_owner == 'pytorch' - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.9 with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -116,5 +116,5 @@ jobs: AWS_REGION: "" - name: Teardown Linux - uses: pytorch/test-infra/.github/actions/teardown-linux@main + uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.9 if: always() diff --git a/.github/workflows/nightly-s3-uploads.yml b/.github/workflows/nightly-s3-uploads.yml index acf3504dec9c..1cafca0e0c85 100644 --- a/.github/workflows/nightly-s3-uploads.yml +++ b/.github/workflows/nightly-s3-uploads.yml @@ -23,7 +23,7 @@ jobs: environment: upload-stats steps: - name: Checkout PyTorch - uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.9 with: fetch-depth: 1 submodules: false diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml index 2acc987e523c..eddb21ea2ca5 100644 --- a/.github/workflows/nightly.yml +++ b/.github/workflows/nightly.yml @@ -19,7 +19,7 @@ concurrency: jobs: get-label-type: name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.9 if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} @@ -42,8 +42,8 @@ jobs: needs: get-label-type with: runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" - build-environment: linux-jammy-py3.9-gcc11 - docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11 + build-environment: linux-jammy-py3.10-gcc11 + docker-image-name: ci-image:pytorch-linux-jammy-py3.10-gcc11 secrets: inherit docs-push: @@ -54,7 +54,7 @@ jobs: - get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build-environment: linux-jammy-py3.9-gcc11 + build-environment: linux-jammy-py3.10-gcc11 docker-image: ${{ needs.docs-build.outputs.docker-image }} push: ${{ github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' || startsWith(github.event.ref, 'refs/tags/v') }} run-doxygen: true @@ -92,7 +92,7 @@ jobs: if: github.repository_owner == 'pytorch' && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') steps: - name: "${{ matrix.repo-owner }}/${{ matrix.repo-name }} update-commit-hash" - uses: pytorch/test-infra/.github/actions/update-commit-hash@main + uses: pytorch/test-infra/.github/actions/update-commit-hash@release/2.9 with: repo-owner: ${{ matrix.repo-owner }} repo-name: ${{ matrix.repo-name }} diff --git a/.github/workflows/nitpicker.yml b/.github/workflows/nitpicker.yml index 40bd245ce913..242f021e46fa 100644 --- a/.github/workflows/nitpicker.yml +++ b/.github/workflows/nitpicker.yml @@ -19,7 +19,7 @@ jobs: if: ${{ github.event.pull_request.number != 26921 && github.repository_owner == 'pytorch' }} steps: - name: Checkout PyTorch - uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.9 - uses: ethanis/nitpicker@v1 with: nitpicks: '.github/nitpicks.yml' diff --git a/.github/workflows/operator_benchmark.yml b/.github/workflows/operator_benchmark.yml index 16cb1600b8d6..dcdc2cd0ba24 100644 
--- a/.github/workflows/operator_benchmark.yml +++ b/.github/workflows/operator_benchmark.yml @@ -14,6 +14,10 @@ on: schedule: # Run at 07:00 UTC every Sunday - cron: 0 7 * * 0 + pull_request: + paths: + - benchmarks/operator_benchmark/** + - .github/workflows/operator_benchmark.yml concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} @@ -24,38 +28,38 @@ permissions: contents: read jobs: - linux-jammy-cpu-py3_9-gcc11-opbenchmark-build: + opbenchmark-build: if: github.repository_owner == 'pytorch' - name: linux-jammy-cpu-py3.9-gcc11-opbenchmark + name: opbenchmark-build uses: ./.github/workflows/_linux-build.yml with: - build-environment: linux-jammy-py3.9-gcc11-build - docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks + build-environment: linux-jammy-py3.10-gcc11-build + docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks test-matrix: | { include: [ { config: "cpu_operator_benchmark_short", shard: 1, num_shards: 1, runner: "linux.12xlarge" }, ]} secrets: inherit - linux-jammy-cpu-py3_9-gcc11-opbenchmark-on-demand-build: + opbenchmark-on-demand-build: if: ${{ github.event_name == 'workflow_dispatch' && github.repository_owner == 'pytorch' }} - name: linux-jammy-cpu-py3.9-gcc11-opbenchmark + name: opbenchmark-on-demand-build uses: ./.github/workflows/_linux-build.yml with: - build-environment: linux-jammy-py3.9-gcc11-build - docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks + build-environment: linux-jammy-py3.10-gcc11-build + docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks test-matrix: | { include: [ { config: "cpu_operator_benchmark_${{ inputs.test_mode }}", shard: 1, num_shards: 1, runner: "linux.12xlarge" }, ]} secrets: inherit - linux-jammy-cpu-py3_9-gcc11-opbenchmark-test: - name: linux-jammy-cpu-py3.9-gcc11-opbenchmark + opbenchmark-test: + name: opbenchmark-test uses: ./.github/workflows/_linux-test.yml - needs: linux-jammy-cpu-py3_9-gcc11-opbenchmark-build + needs: opbenchmark-build with: - build-environment: linux-jammy-py3.9-gcc11-build - docker-image: ${{ needs.linux-jammy-cpu-py3_9-gcc11-opbenchmark-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-jammy-cpu-py3_9-gcc11-opbenchmark-build.outputs.test-matrix }} + build-environment: linux-jammy-py3.10-gcc11-build + docker-image: ${{ needs.opbenchmark-build.outputs.docker-image }} + test-matrix: ${{ needs.opbenchmark-build.outputs.test-matrix }} secrets: inherit diff --git a/.github/workflows/operator_microbenchmark.yml b/.github/workflows/operator_microbenchmark.yml new file mode 100644 index 000000000000..9205b927c5d7 --- /dev/null +++ b/.github/workflows/operator_microbenchmark.yml @@ -0,0 +1,46 @@ +name: operator_microbenchmark + +on: + push: + tags: + - ciflow/op-benchmark/* + workflow_dispatch: + schedule: + # Run at 06:00 UTC everyday + - cron: 0 6 * * * + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} + cancel-in-progress: true + +permissions: + id-token: write + contents: read + +jobs: + opmicrobenchmark-build: + if: github.repository_owner == 'pytorch' + name: opmicrobenchmark-build + uses: ./.github/workflows/_linux-build.yml + with: + runner: linux.12xlarge.memory + build-environment: 
linux-jammy-cuda12.8-py3.10-gcc9-sm80 + docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11 + cuda-arch-list: '8.0 9.0' + test-matrix: | + { include: [ + { config: "operator_microbenchmark_test", shard: 1, num_shards: 1, runner: "linux.aws.h100" }, + { config: "operator_microbenchmark_test", shard: 1, num_shards: 1, runner: "linux.aws.a100" }, + ]} + secrets: inherit + + opmicrobenchmark-test: + name: opmicrobenchmark-test + uses: ./.github/workflows/_linux-test.yml + needs: opmicrobenchmark-build + with: + timeout-minutes: 500 + build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80 + docker-image: ${{ needs.opmicrobenchmark-build.outputs.docker-image }} + test-matrix: ${{ needs.opmicrobenchmark-build.outputs.test-matrix }} + secrets: inherit diff --git a/.github/workflows/periodic-rocm-mi300.yml b/.github/workflows/periodic-rocm-mi300.yml index 4d8890e69fc7..850c98b3fa81 100644 --- a/.github/workflows/periodic-rocm-mi300.yml +++ b/.github/workflows/periodic-rocm-mi300.yml @@ -41,7 +41,7 @@ jobs: get-label-type: name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.9 if: (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' with: triggering_actor: ${{ github.triggering_actor }} diff --git a/.github/workflows/periodic.yml b/.github/workflows/periodic.yml index 7d43c68c61b0..418699cb5f5a 100644 --- a/.github/workflows/periodic.yml +++ b/.github/workflows/periodic.yml @@ -43,7 +43,7 @@ jobs: get-label-type: name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.9 if: (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' with: triggering_actor: ${{ github.triggering_actor }} @@ -59,13 +59,14 @@ jobs: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build-environment: linux-jammy-cuda12.4-py3.10-gcc11 docker-image-name: ci-image:pytorch-linux-jammy-cuda12.4-cudnn9-py3-gcc11 + cuda-arch-list: 7.5 test-matrix: | { include: [ - { config: "legacy_nvidia_driver", shard: 1, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" }, - { config: "legacy_nvidia_driver", shard: 2, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" }, - { config: "legacy_nvidia_driver", shard: 3, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" }, - { config: "legacy_nvidia_driver", shard: 4, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" }, - { config: "legacy_nvidia_driver", shard: 5, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" }, + { config: "legacy_nvidia_driver", shard: 1, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.4xlarge.nvidia.gpu" }, + { config: "legacy_nvidia_driver", shard: 2, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.4xlarge.nvidia.gpu" }, + { config: "legacy_nvidia_driver", shard: 3, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.4xlarge.nvidia.gpu" }, + { config: "legacy_nvidia_driver", shard: 4, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type 
}}linux.g4dn.4xlarge.nvidia.gpu" }, + { config: "legacy_nvidia_driver", shard: 5, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.4xlarge.nvidia.gpu" }, ]} secrets: inherit @@ -170,6 +171,38 @@ jobs: test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-debug-build.outputs.test-matrix }} secrets: inherit + linux-jammy-cuda13_0-py3_10-gcc11-build: + name: linux-jammy-cuda13.0-py3.10-gcc11 + uses: ./.github/workflows/_linux-build.yml + needs: get-label-type + with: + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + cuda-arch-list: 7.5 + build-environment: linux-jammy-cuda13.0-py3.10-gcc11 + docker-image-name: ci-image:pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc11 + test-matrix: | + { include: [ + { config: "nogpu_AVX512", shard: 1, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.4xlarge.nvidia.gpu" }, + { config: "nogpu_AVX512", shard: 2, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.4xlarge.nvidia.gpu" }, + { config: "nogpu_AVX512", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.4xlarge.nvidia.gpu" }, + { config: "nogpu_NO_AVX2", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.4xlarge.nvidia.gpu" }, + { config: "nogpu_NO_AVX2", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.4xlarge.nvidia.gpu" }, + { config: "jit_legacy", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.4xlarge.nvidia.gpu" }, + ]} + secrets: inherit + + linux-jammy-cuda13_0-py3_10-gcc11-test: + name: linux-jammy-cuda13.0-py3.10-gcc11 + uses: ./.github/workflows/_linux-test.yml + needs: + - linux-jammy-cuda13_0-py3_10-gcc11-build + - target-determination + with: + build-environment: linux-jammy-cuda13.0-py3.10-gcc11 + docker-image: ${{ needs.linux-jammy-cuda13_0-py3_10-gcc11-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-cuda13_0-py3_10-gcc11-build.outputs.test-matrix }} + secrets: inherit + linux-jammy-rocm-py3_10-build: name: linux-jammy-rocm-py3.10 uses: ./.github/workflows/_linux-build.yml diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index e2cac7bb7315..f884fee53fc7 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -42,21 +42,21 @@ jobs: get-label-type: name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.9 if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} curr_branch: ${{ github.head_ref || github.ref_name }} - linux-jammy-py3_9-gcc11-build: - name: linux-jammy-py3.9-gcc11 + linux-jammy-py3_10-gcc11-build: + name: linux-jammy-py3.10-gcc11 uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build-environment: linux-jammy-py3.9-gcc11 - docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11 + build-environment: linux-jammy-py3.10-gcc11 + docker-image-name: ci-image:pytorch-linux-jammy-py3.10-gcc11 test-matrix: | { include: [ { config: "default", shard: 1, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" }, @@ 
-73,49 +73,49 @@ jobs: ]} secrets: inherit - linux-jammy-py3_9-gcc11-test: - name: linux-jammy-py3.9-gcc11 + linux-jammy-py3_10-gcc11-test: + name: linux-jammy-py3.10-gcc11 uses: ./.github/workflows/_linux-test.yml needs: - - linux-jammy-py3_9-gcc11-build + - linux-jammy-py3_10-gcc11-build - target-determination with: - build-environment: linux-jammy-py3.9-gcc11 - docker-image: ${{ needs.linux-jammy-py3_9-gcc11-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-jammy-py3_9-gcc11-build.outputs.test-matrix }} + build-environment: linux-jammy-py3.10-gcc11 + docker-image: ${{ needs.linux-jammy-py3_10-gcc11-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-py3_10-gcc11-build.outputs.test-matrix }} secrets: inherit linux-docs: name: linux-docs uses: ./.github/workflows/_docs.yml - needs: linux-jammy-py3_9-gcc11-build + needs: linux-jammy-py3_10-gcc11-build with: - build-environment: linux-jammy-py3.9-gcc11 - docker-image: ${{ needs.linux-jammy-py3_9-gcc11-build.outputs.docker-image }} + build-environment: linux-jammy-py3.10-gcc11 + docker-image: ${{ needs.linux-jammy-py3_10-gcc11-build.outputs.docker-image }} secrets: inherit - linux-jammy-py3_9-gcc11-no-ops: - name: linux-jammy-py3.9-gcc11-no-ops + linux-jammy-py3_10-gcc11-no-ops: + name: linux-jammy-py3.10-gcc11-no-ops uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build-environment: linux-jammy-py3.9-gcc11-no-ops - docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11 + build-environment: linux-jammy-py3.10-gcc11-no-ops + docker-image-name: ci-image:pytorch-linux-jammy-py3.10-gcc11 test-matrix: | { include: [ { config: "default", shard: 1, num_shards: 1 }, ]} secrets: inherit - linux-jammy-py3_9-gcc11-pch: - name: linux-jammy-py3.9-gcc11-pch + linux-jammy-py3_10-gcc11-pch: + name: linux-jammy-py3.10-gcc11-pch uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build-environment: linux-jammy-py3.9-gcc11-pch - docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11 + build-environment: linux-jammy-py3.10-gcc11-pch + docker-image-name: ci-image:pytorch-linux-jammy-py3.10-gcc11 test-matrix: | { include: [ { config: "default", shard: 1, num_shards: 1 }, @@ -132,17 +132,17 @@ jobs: docker-image-name: ci-image:pytorch-linux-jammy-py3-clang18-asan test-matrix: | { include: [ - { config: "default", shard: 1, num_shards: 6, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, - { config: "default", shard: 2, num_shards: 6, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, - { config: "default", shard: 3, num_shards: 6, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, - { config: "default", shard: 4, num_shards: 6, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, - { config: "default", shard: 5, num_shards: 6, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, - { config: "default", shard: 6, num_shards: 6, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, + { config: "default", shard: 1, num_shards: 7, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, + { config: "default", shard: 2, num_shards: 7, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, + { config: "default", shard: 3, num_shards: 7, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, 
+ { config: "default", shard: 4, num_shards: 7, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, + { config: "default", shard: 5, num_shards: 7, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, + { config: "default", shard: 6, num_shards: 7, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, + { config: "default", shard: 7, num_shards: 7, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, ]} sync-tag: asan-build secrets: inherit - linux-jammy-py3_10-clang18-asan-test: name: linux-jammy-py3.10-clang18-asan uses: ./.github/workflows/_linux-test.yml @@ -183,14 +183,14 @@ jobs: test-matrix: ${{ needs.linux-jammy-py3_10-clang12-onnx-build.outputs.test-matrix }} secrets: inherit - linux-jammy-py3_9-clang12-build: - name: linux-jammy-py3.9-clang12 + linux-jammy-py3_10-clang12-build: + name: linux-jammy-py3.10-clang12 uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build-environment: linux-jammy-py3.9-clang12 - docker-image-name: ci-image:pytorch-linux-jammy-py3.9-clang12 + build-environment: linux-jammy-py3.10-clang12 + docker-image-name: ci-image:pytorch-linux-jammy-py3.10-clang12 test-matrix: | { include: [ { config: "default", shard: 1, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, @@ -207,16 +207,16 @@ jobs: ]} secrets: inherit - linux-jammy-py3_9-clang12-test: - name: linux-jammy-py3.9-clang12 + linux-jammy-py3_10-clang12-test: + name: linux-jammy-py3.10-clang12 uses: ./.github/workflows/_linux-test.yml needs: - - linux-jammy-py3_9-clang12-build + - linux-jammy-py3_10-clang12-build - target-determination with: - build-environment: linux-jammy-py3.9-clang12 - docker-image: ${{ needs.linux-jammy-py3_9-clang12-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-jammy-py3_9-clang12-build.outputs.test-matrix }} + build-environment: linux-jammy-py3.10-clang12 + docker-image: ${{ needs.linux-jammy-py3_10-clang12-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-py3_10-clang12-build.outputs.test-matrix }} secrets: inherit linux-jammy-py3_13-clang12-build: @@ -253,14 +253,14 @@ jobs: test-matrix: ${{ needs.linux-jammy-py3_13-clang12-build.outputs.test-matrix }} secrets: inherit - linux-jammy-cuda12_8-cudnn9-py3_9-clang12-build: - name: linux-jammy-cuda12.8-cudnn9-py3.9-clang12 + linux-jammy-cuda12_8-cudnn9-py3_10-clang12-build: + name: linux-jammy-cuda12.8-cudnn9-py3.10-clang12 uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build-environment: linux-jammy-cuda12.8-cudnn9-py3.9-clang12 - docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-clang12 + build-environment: linux-jammy-cuda12.8-cudnn9-py3.10-clang12 + docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3.10-clang12 test-matrix: | { include: [ { config: "default", shard: 1, num_shards: 1 }, @@ -282,14 +282,14 @@ jobs: ]} secrets: inherit - linux-jammy-py3_9-gcc11-mobile-lightweight-dispatch-build: - name: linux-jammy-py3.9-gcc11-mobile-lightweight-dispatch-build + linux-jammy-py3_10-gcc11-mobile-lightweight-dispatch-build: + name: linux-jammy-py3.10-gcc11-mobile-lightweight-dispatch-build uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build-environment: 
linux-jammy-py3.9-gcc11-mobile-lightweight-dispatch-build - docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11 + build-environment: linux-jammy-py3.10-gcc11-mobile-lightweight-dispatch-build + docker-image-name: ci-image:pytorch-linux-jammy-py3.10-gcc11 build-generates-artifacts: false test-matrix: | { include: [ @@ -342,15 +342,40 @@ jobs: test-matrix: ${{ needs.linux-jammy-py3-clang12-executorch-build.outputs.test-matrix }} secrets: inherit - linux-jammy-xpu-2025_1-py3_9-build: - name: linux-jammy-xpu-2025.1-py3.9 + linux-jammy-cuda12_8-py3_10-gcc9-inductor-build: + name: cuda12.8-py3.10-gcc9-sm75 + uses: ./.github/workflows/_linux-build.yml + needs: get-label-type + with: + runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" + build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm75 + docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks + cuda-arch-list: '7.5' + test-matrix: | + { include: [ + { config: "pr_time_benchmarks", shard: 1, num_shards: 1, runner: "linux.g4dn.metal.nvidia.gpu" }, + ]} + secrets: inherit + + linux-jammy-cuda12_8-py3_10-gcc9-inductor-test: + name: cuda12.8-py3.10-gcc9-sm75 + uses: ./.github/workflows/_linux-test.yml + needs: linux-jammy-cuda12_8-py3_10-gcc9-inductor-build + with: + build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm75 + docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.test-matrix }} + secrets: inherit + + linux-jammy-xpu-n-py3_9-build: + name: linux-jammy-xpu-n-py3.9 uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: - sync-tag: linux-xpu-2025-1-build + sync-tag: linux-xpu-n-build runner_prefix: ${{ needs.get-label-type.outputs.label-type }} - build-environment: linux-jammy-xpu-2025.1-py3.9 - docker-image-name: ci-image:pytorch-linux-jammy-xpu-2025.1-py3 + build-environment: linux-jammy-xpu-n-py3.9 + docker-image-name: ci-image:pytorch-linux-jammy-xpu-n-py3 test-matrix: | { include: [ { config: "default", shard: 1, num_shards: 4, runner: "linux.idc.xpu" }, diff --git a/.github/workflows/rocm-mi300.yml b/.github/workflows/rocm-mi300.yml index 7e3ba43bf984..51a807250f54 100644 --- a/.github/workflows/rocm-mi300.yml +++ b/.github/workflows/rocm-mi300.yml @@ -28,7 +28,7 @@ jobs: get-label-type: name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.9 if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} diff --git a/.github/workflows/rocm-mi355.yml b/.github/workflows/rocm-mi355.yml index e5dda604a4db..adf5fe919087 100644 --- a/.github/workflows/rocm-mi355.yml +++ b/.github/workflows/rocm-mi355.yml @@ -22,7 +22,7 @@ jobs: get-label-type: name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.9 if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} diff --git a/.github/workflows/slow.yml b/.github/workflows/slow.yml index 2a7b1d184330..197a04054bfe 100644 --- a/.github/workflows/slow.yml +++ b/.github/workflows/slow.yml @@ -41,7 +41,7 @@ jobs: get-label-type: name: 
get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.9 if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} @@ -78,14 +78,14 @@ jobs: test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm86-build.outputs.test-matrix }} secrets: inherit - linux-jammy-py3_9-clang12-build: - name: linux-jammy-py3.9-clang12 + linux-jammy-py3_10-clang12-build: + name: linux-jammy-py3.10-clang12 uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build-environment: linux-jammy-py3.9-clang12 - docker-image-name: ci-image:pytorch-linux-jammy-py3.9-clang12 + build-environment: linux-jammy-py3.10-clang12 + docker-image-name: ci-image:pytorch-linux-jammy-py3.10-clang12 test-matrix: | { include: [ { config: "slow", shard: 1, num_shards: 2, runner: "linux.2xlarge" }, @@ -93,16 +93,16 @@ jobs: ]} secrets: inherit - linux-jammy-py3_9-clang12-test: - name: linux-jammy-py3.9-clang12 + linux-jammy-py3_10-clang12-test: + name: linux-jammy-py3.10-clang12 uses: ./.github/workflows/_linux-test.yml needs: - - linux-jammy-py3_9-clang12-build + - linux-jammy-py3_10-clang12-build - target-determination with: - build-environment: linux-jammy-py3.9-clang12 - docker-image: ${{ needs.linux-jammy-py3_9-clang12-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-jammy-py3_9-clang12-build.outputs.test-matrix }} + build-environment: linux-jammy-py3.10-clang12 + docker-image: ${{ needs.linux-jammy-py3_10-clang12-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-py3_10-clang12-build.outputs.test-matrix }} secrets: inherit linux-jammy-rocm-py3_10-build: diff --git a/.github/workflows/target-determination-indexer.yml b/.github/workflows/target-determination-indexer.yml index ec579fda8da9..f5f29c9646f4 100644 --- a/.github/workflows/target-determination-indexer.yml +++ b/.github/workflows/target-determination-indexer.yml @@ -13,7 +13,7 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.9 with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -35,7 +35,7 @@ jobs: - name: Calculate docker image id: calculate-docker-image - uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.9 with: docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11 working-directory: pytorch @@ -50,13 +50,13 @@ jobs: echo "docker pull ghcr.io/pytorch/ci-image:${tag/:/-}" - name: Pull docker image - uses: pytorch/test-infra/.github/actions/pull-docker-image@main + uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.9 with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG id: install-nvidia-driver - uses: pytorch/test-infra/.github/actions/setup-nvidia@main + uses: pytorch/test-infra/.github/actions/setup-nvidia@release/2.9 - name: Clone CodeLlama uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 @@ -149,7 +149,7 @@ 
jobs: "s3://target-determinator-assets/indexes/latest/${ZIP_NAME}" - name: Teardown Linux - uses: pytorch/test-infra/.github/actions/teardown-linux@main + uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.9 if: always() concurrency: diff --git a/.github/workflows/target_determination.yml b/.github/workflows/target_determination.yml index c712b11185a7..3e9f848e9e09 100644 --- a/.github/workflows/target_determination.yml +++ b/.github/workflows/target_determination.yml @@ -9,7 +9,7 @@ jobs: name: get-label-type # Don't run on forked repos if: github.repository_owner == 'pytorch' - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.9 with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} @@ -27,7 +27,7 @@ jobs: # checkout because when we run this action we don't *have* a local # checkout. In other cases you should prefer a local checkout. - name: Checkout PyTorch - uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.9 with: submodules: false diff --git a/.github/workflows/test-check-binary.yml b/.github/workflows/test-check-binary.yml index 5f0ad59d3a3b..a13e1d027f13 100644 --- a/.github/workflows/test-check-binary.yml +++ b/.github/workflows/test-check-binary.yml @@ -15,7 +15,7 @@ jobs: check_binary_linux_cpu: if: github.repository_owner == 'pytorch' name: Test check_binary.sh for Linux CPU - uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@release/2.9 with: docker-image: python:3.11 docker-build-dir: "skip-docker-build" @@ -28,7 +28,7 @@ jobs: check_binary_linux_cuda: if: github.repository_owner == 'pytorch' name: Test check_binary.sh for Linux CUDA - uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@release/2.9 with: runner: linux.g4dn.4xlarge.nvidia.gpu docker-image: python:3.11 diff --git a/.github/workflows/test-h100.yml b/.github/workflows/test-h100.yml index 1e83c7b9d98c..d08d6033c47e 100644 --- a/.github/workflows/test-h100.yml +++ b/.github/workflows/test-h100.yml @@ -28,7 +28,7 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.9 with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} diff --git a/.github/workflows/tools-unit-tests.yml b/.github/workflows/tools-unit-tests.yml index c687c07b7ca7..9c104571ef89 100644 --- a/.github/workflows/tools-unit-tests.yml +++ b/.github/workflows/tools-unit-tests.yml @@ -25,7 +25,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout pytorch - uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.9 with: submodules: true fetch-depth: 0 @@ -52,7 +52,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout pytorch - uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.9 with: submodules: true fetch-depth: 0 diff --git a/.github/workflows/torchbench.yml b/.github/workflows/torchbench.yml index 
08fcd3340262..e4f0c692e976 100644 --- a/.github/workflows/torchbench.yml +++ b/.github/workflows/torchbench.yml @@ -18,7 +18,7 @@ jobs: get-default-label-prefix: if: github.repository_owner == 'pytorch' name: get-default-label-prefix - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.9 with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml index 0081e4e1f895..efc027ad2acb 100644 --- a/.github/workflows/trunk.yml +++ b/.github/workflows/trunk.yml @@ -39,7 +39,7 @@ jobs: get-label-type: name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.9 if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} @@ -224,13 +224,12 @@ jobs: tests-to-include: "test_nn test_torch test_cuda test_ops test_unary_ufuncs test_binary_ufuncs test_autograd inductor/test_torchinductor distributed/test_c10d_common distributed/test_c10d_nccl" secrets: inherit - # NB: Keep this in sync with inductor-perf-test-nightly.yml - linux-jammy-cuda12_8-py3_10-gcc9-inductor-build: - name: cuda12.8-py3.10-gcc9-sm80 + inductor-build: + name: inductor-build uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: - build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80 + build-environment: linux-jammy-cuda12.8-py3.12-gcc9-sm80 docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks cuda-arch-list: '8.0' secrets: inherit @@ -241,8 +240,8 @@ jobs: needs: get-label-type with: runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" - build-environment: linux-jammy-py3.9-gcc11 - docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks + build-environment: linux-jammy-py3.10-gcc11 + docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks test-matrix: | { include: [ { config: "verify_cachebench", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" }, @@ -256,7 +255,7 @@ jobs: - verify-cachebench-cpu-build - target-determination with: - build-environment: linux-jammy-py3.9-gcc11 + build-environment: linux-jammy-py3.10-gcc11 docker-image: ${{ needs.verify-cachebench-cpu-build.outputs.docker-image }} test-matrix: ${{ needs.verify-cachebench-cpu-build.outputs.test-matrix }} secrets: inherit diff --git a/.github/workflows/trymerge.yml b/.github/workflows/trymerge.yml index 1fdb1da67a59..5c456c607c88 100644 --- a/.github/workflows/trymerge.yml +++ b/.github/workflows/trymerge.yml @@ -59,22 +59,19 @@ jobs: # on the PR appear in chronological order (timing issues can shuffle them around) sleep 60 fi + + # Require a comment id for merge operations + if [ -z "${COMMENT_ID}" ]; then + echo "Error: merge requires COMMENT_ID to be specified" + exit 1 + fi + if [ -n "${FORCE}" ]; then - if [ -n "${COMMENT_ID}" ]; then - python3 .github/scripts/trymerge.py --force --comment-id "${COMMENT_ID}" "${PR_NUM}" - else - python3 .github/scripts/trymerge.py --force "${PR_NUM}" - fi + python3 .github/scripts/trymerge.py --force --comment-id "${COMMENT_ID}" "${PR_NUM}" elif [ -n "${IGNORE_CURRENT}" ]; then - if [ -n 
"${COMMENT_ID}" ]; then - python3 .github/scripts/trymerge.py --ignore-current --comment-id "${COMMENT_ID}" "${PR_NUM}" - else - python3 .github/scripts/trymerge.py --ignore-current "${PR_NUM}" - fi - elif [ -n "${COMMENT_ID}" ]; then - python3 .github/scripts/trymerge.py --comment-id "${COMMENT_ID}" "${PR_NUM}" + python3 .github/scripts/trymerge.py --ignore-current --comment-id "${COMMENT_ID}" "${PR_NUM}" else - python3 .github/scripts/trymerge.py "${PR_NUM}" + python3 .github/scripts/trymerge.py --comment-id "${COMMENT_ID}" "${PR_NUM}" fi - name: Comment on Canceled if: ${{ cancelled() && steps.checkout.outcome == 'success' }} diff --git a/.github/workflows/unstable.yml b/.github/workflows/unstable.yml index 7f0fe6058bd0..5eeb8b19a325 100644 --- a/.github/workflows/unstable.yml +++ b/.github/workflows/unstable.yml @@ -46,7 +46,7 @@ jobs: get-label-type: name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.9 if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} diff --git a/.github/workflows/update-viablestrict.yml b/.github/workflows/update-viablestrict.yml index 3d445756f7a2..e3ca35d2d01d 100644 --- a/.github/workflows/update-viablestrict.yml +++ b/.github/workflows/update-viablestrict.yml @@ -18,12 +18,12 @@ jobs: environment: ${{ (github.event_name == 'schedule') && 'mergebot' || '' }} steps: - name: Update viable/strict - uses: pytorch/test-infra/.github/actions/update-viablestrict@main + uses: pytorch/test-infra/.github/actions/update-viablestrict@release/2.9 id: update_viablestrict with: repository: pytorch/pytorch stable-branch: viable/strict - requires: '[\"pull\", \"trunk\", \"lint\", \"linux-binary\", \"linux-aarch64\"]' + requires: '[\"pull\", \"trunk\", \"lint\", \"^linux-binary-manywheel$\", \"^linux-binary-libtorch-release$\", \"linux-aarch64\"]' secret-bot-token: ${{ secrets.MERGEBOT_TOKEN }} clickhouse-url: ${{ secrets.CLICKHOUSE_URL }} clickhouse-username: ${{ secrets.CLICKHOUSE_VIABLESTRICT_USERNAME }} diff --git a/.github/workflows/update_pytorch_labels.yml b/.github/workflows/update_pytorch_labels.yml index a1b8c38141ae..535950b3c0b7 100644 --- a/.github/workflows/update_pytorch_labels.yml +++ b/.github/workflows/update_pytorch_labels.yml @@ -17,7 +17,7 @@ jobs: contents: read steps: - name: Checkout PyTorch - uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.9 with: fetch-depth: 1 submodules: false diff --git a/.github/workflows/upload-test-stats-while-running.yml b/.github/workflows/upload-test-stats-while-running.yml index 9aecaad0e068..82c21467dc6a 100644 --- a/.github/workflows/upload-test-stats-while-running.yml +++ b/.github/workflows/upload-test-stats-while-running.yml @@ -16,7 +16,7 @@ jobs: runs-on: linux.2xlarge steps: - name: Checkout PyTorch - uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.9 with: fetch-depth: 1 submodules: false diff --git a/.github/workflows/upload-test-stats.yml b/.github/workflows/upload-test-stats.yml index f77b6081b776..3cfc651b2a62 100644 --- a/.github/workflows/upload-test-stats.yml +++ b/.github/workflows/upload-test-stats.yml @@ -58,7 +58,7 @@ jobs: run: echo "${TRIGGERING_WORKFLOW}" - name: Checkout PyTorch - uses: 
pytorch/pytorch/.github/actions/checkout-pytorch@main + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.9 - name: Configure aws credentials uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0 diff --git a/.github/workflows/upload-torch-dynamo-perf-stats.yml b/.github/workflows/upload-torch-dynamo-perf-stats.yml index 07471619437a..db3fc72e68e9 100644 --- a/.github/workflows/upload-torch-dynamo-perf-stats.yml +++ b/.github/workflows/upload-torch-dynamo-perf-stats.yml @@ -32,7 +32,7 @@ jobs: name: Upload dynamo performance stats for ${{ github.event.workflow_run.id }}, attempt ${{ github.event.workflow_run.run_attempt }} steps: - name: Checkout PyTorch - uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.9 with: submodules: false fetch-depth: 1 diff --git a/.github/workflows/upload_test_stats_intermediate.yml b/.github/workflows/upload_test_stats_intermediate.yml index 570256200605..1764139fed25 100644 --- a/.github/workflows/upload_test_stats_intermediate.yml +++ b/.github/workflows/upload_test_stats_intermediate.yml @@ -17,7 +17,7 @@ jobs: environment: upload-stats steps: - name: Checkout PyTorch - uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.9 with: fetch-depth: 1 submodules: false diff --git a/.github/workflows/vllm.yml b/.github/workflows/vllm.yml index 14524069ab5a..dcfa4027c7ae 100644 --- a/.github/workflows/vllm.yml +++ b/.github/workflows/vllm.yml @@ -2,12 +2,14 @@ name: vllm-test on: push: + branches: + - main + - release/* tags: - ciflow/vllm/* workflow_dispatch: schedule: - # Every 12 hours starting at 00:00 UTC (00:00 and 12:00) - - cron: '0 0,12 * * *' + - cron: '0 */8 * * *' # every 8 hours at minute 0 (UTC) concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }} @@ -20,7 +22,7 @@ permissions: jobs: get-label-type: name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.9 if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} with: triggering_actor: ${{ github.triggering_actor }} @@ -46,14 +48,18 @@ jobs: { config: "vllm_basic_models_test", shard: 1, num_shards: 1, runner: "linux.g6.4xlarge.experimental.nvidia.gpu" }, { config: "vllm_entrypoints_test", shard: 1, num_shards: 1,runner: "linux.g6.4xlarge.experimental.nvidia.gpu" }, { config: "vllm_regression_test", shard: 1, num_shards: 1, runner: "linux.g6.4xlarge.experimental.nvidia.gpu" }, - { config: "vllm_lora_280_failure_test", shard: 1, num_shards: 1, runner: "linux.g6.4xlarge.experimental.nvidia.gpu" }, { config: "vllm_multi_model_processor_test", shard: 1, num_shards: 1, runner: "linux.g6.4xlarge.experimental.nvidia.gpu" }, { config: "vllm_pytorch_compilation_unit_tests", shard: 1, num_shards: 1, runner: "linux.g6.4xlarge.experimental.nvidia.gpu" }, + { config: "vllm_lora_28_failure_test", shard: 1, num_shards: 1, runner: "linux.g6.4xlarge.experimental.nvidia.gpu" }, + { config: "vllm_multi_model_test_28_failure_test", shard: 1, num_shards: 1, runner: "linux.g6.4xlarge.experimental.nvidia.gpu"}, + { config: "vllm_languagde_model_test_extended_generation_28_failure_test", shard: 1, 
num_shards: 1, runner: "linux.g6.4xlarge.experimental.nvidia.gpu"}, + { config: "vllm_distributed_test_2_gpu_28_failure_test", shard: 1, num_shards: 1, runner: "linux.g6.4xlarge.experimental.nvidia.gpu" }, { config: "vllm_lora_test", shard: 0, num_shards: 4, runner: "linux.g6.4xlarge.experimental.nvidia.gpu" }, { config: "vllm_lora_test", shard: 1, num_shards: 4, runner: "linux.g6.4xlarge.experimental.nvidia.gpu" }, { config: "vllm_lora_test", shard: 2, num_shards: 4, runner: "linux.g6.4xlarge.experimental.nvidia.gpu" }, { config: "vllm_lora_test", shard: 3, num_shards: 4, runner: "linux.g6.4xlarge.experimental.nvidia.gpu" }, - { config: "vllm_lora_tp_test_distributed", shard: 1, num_shards: 1, runner: "linux.aws.h100.4"}, + { config: "vllm_lora_tp_test_distributed", shard: 1, num_shards: 1, runner: "linux.g6.12xlarge.nvidia.gpu"}, + { config: "vllm_distributed_test_28_failure_test", shard: 1, num_shards: 1, runner: "linux.g6.12xlarge.nvidia.gpu"} ]} secrets: inherit diff --git a/.github/workflows/weekly.yml b/.github/workflows/weekly.yml index b95dadd5f2b1..2c534891c6e2 100644 --- a/.github/workflows/weekly.yml +++ b/.github/workflows/weekly.yml @@ -22,7 +22,7 @@ jobs: fetch-depth: 0 - name: update-xla-commit-hash continue-on-error: true - uses: pytorch/test-infra/.github/actions/update-commit-hash@main + uses: pytorch/test-infra/.github/actions/update-commit-hash@release/2.9 with: repo-name: xla branch: master diff --git a/.github/workflows/win-arm64-build-test.yml b/.github/workflows/win-arm64-build-test.yml index 627a43b56bf7..95b4e2f027f6 100644 --- a/.github/workflows/win-arm64-build-test.yml +++ b/.github/workflows/win-arm64-build-test.yml @@ -4,6 +4,9 @@ on: push: tags: - ciflow/win-arm64/* + schedule: + # Every 4 hours starting at 00:00 UTC + - cron: '0 */4 * * *' env: GIT_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }} diff --git a/.github/workflows/xpu.yml b/.github/workflows/xpu.yml index c62918b4af21..3a17bb9d70a1 100644 --- a/.github/workflows/xpu.yml +++ b/.github/workflows/xpu.yml @@ -19,22 +19,22 @@ jobs: get-label-type: if: github.repository_owner == 'pytorch' name: get-label-type - uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main + uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.9 with: triggering_actor: ${{ github.triggering_actor }} issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} curr_branch: ${{ github.head_ref || github.ref_name }} curr_ref_type: ${{ github.ref_type }} - linux-jammy-xpu-2025_0-py3_9-build: - name: linux-jammy-xpu-2025.0-py3.9 + linux-jammy-xpu-n-1-py3_10-build: + name: linux-jammy-xpu-n-1-py3.10 uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: - sync-tag: linux-xpu-2025-0-build + sync-tag: linux-xpu-n-1-build runner_prefix: ${{ needs.get-label-type.outputs.label-type }} - build-environment: linux-jammy-xpu-2025.0-py3.9 - docker-image-name: ci-image:pytorch-linux-jammy-xpu-2025.0-py3 + build-environment: linux-jammy-xpu-n-1-py3.10 + docker-image-name: ci-image:pytorch-linux-jammy-xpu-n-1-py3 runner: linux.12xlarge test-matrix: | { include: [ @@ -47,60 +47,62 @@ jobs: ]} secrets: inherit - linux-jammy-xpu-2025_1-py3_9-build: - name: linux-jammy-xpu-2025.1-py3.9 + linux-jammy-xpu-n-py3_10-build: + name: linux-jammy-xpu-n-py3.10 uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: - sync-tag: linux-xpu-2025-1-build + sync-tag: linux-xpu-n-build runner_prefix: ${{ needs.get-label-type.outputs.label-type }} - 
build-environment: linux-jammy-xpu-2025.1-py3.9 - docker-image-name: ci-image:pytorch-linux-jammy-xpu-2025.1-py3 + build-environment: linux-jammy-xpu-n-py3.10 + docker-image-name: ci-image:pytorch-linux-jammy-xpu-n-py3 runner: linux.12xlarge test-matrix: | { include: [ - { config: "default", shard: 1, num_shards: 6, runner: "linux.idc.xpu" }, - { config: "default", shard: 2, num_shards: 6, runner: "linux.idc.xpu" }, - { config: "default", shard: 3, num_shards: 6, runner: "linux.idc.xpu" }, - { config: "default", shard: 4, num_shards: 6, runner: "linux.idc.xpu" }, - { config: "default", shard: 5, num_shards: 6, runner: "linux.idc.xpu" }, - { config: "default", shard: 6, num_shards: 6, runner: "linux.idc.xpu" }, + { config: "default", shard: 1, num_shards: 8, runner: "linux.idc.xpu" }, + { config: "default", shard: 2, num_shards: 8, runner: "linux.idc.xpu" }, + { config: "default", shard: 3, num_shards: 8, runner: "linux.idc.xpu" }, + { config: "default", shard: 4, num_shards: 8, runner: "linux.idc.xpu" }, + { config: "default", shard: 5, num_shards: 8, runner: "linux.idc.xpu" }, + { config: "default", shard: 6, num_shards: 8, runner: "linux.idc.xpu" }, + { config: "default", shard: 7, num_shards: 8, runner: "linux.idc.xpu" }, + { config: "default", shard: 8, num_shards: 8, runner: "linux.idc.xpu" }, ]} secrets: inherit - linux-jammy-xpu-2025_1-py3_9-test: - name: linux-jammy-xpu-2025.1-py3.9 + linux-jammy-xpu-n-py3_10-test: + name: linux-jammy-xpu-n-py3.10 uses: ./.github/workflows/_xpu-test.yml - needs: linux-jammy-xpu-2025_1-py3_9-build + needs: linux-jammy-xpu-n-py3_10-build permissions: id-token: write contents: read with: - build-environment: linux-jammy-xpu-2025.1-py3.9 - docker-image: ${{ needs.linux-jammy-xpu-2025_1-py3_9-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-jammy-xpu-2025_1-py3_9-build.outputs.test-matrix }} + build-environment: linux-jammy-xpu-n-py3.10 + docker-image: ${{ needs.linux-jammy-xpu-n-py3_10-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-xpu-n-py3_10-build.outputs.test-matrix }} secrets: inherit - windows-xpu-2025_0-build: + windows-xpu-n-1-build: if: github.repository_owner == 'pytorch' - name: win-vs2022-xpu-2025_0-py3 + name: win-vs2022-xpu-n-1-py3 uses: ./.github/workflows/_win-build.yml with: - build-environment: win-vs2022-xpu-py3 + build-environment: win-vs2022-xpu-n-1-py3 cuda-version: cpu use-xpu: true - xpu-version: '2025.0' + xpu-version: '2025.1' vc-year: '2022' secrets: inherit - windows-xpu-2025_1-build: + windows-xpu-n-build: if: github.repository_owner == 'pytorch' - name: win-vs2022-xpu-2025_1-py3 + name: win-vs2022-xpu-n-py3 uses: ./.github/workflows/_win-build.yml with: - build-environment: win-vs2022-xpu-py3 + build-environment: win-vs2022-xpu-n-py3 cuda-version: cpu use-xpu: true - xpu-version: '2025.1' + xpu-version: '2025.2' vc-year: '2022' secrets: inherit diff --git a/.gitignore b/.gitignore index d1fa4cd3caf2..f20486806796 100644 --- a/.gitignore +++ b/.gitignore @@ -82,6 +82,7 @@ torch/return_types.pyi torch/nn/functional.pyi torch/utils/data/datapipes/datapipe.pyi torch/csrc/autograd/generated/* +torch/csrc/functionalization/generated/* torch/csrc/lazy/generated/*.[!m]* torch_compile_debug/ # Listed manually because some files in this directory are not generated diff --git a/BUILD.bazel b/BUILD.bazel index 58ebc31e243c..f13da6bfbe43 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -91,6 +91,8 @@ generated_cpu_cpp = [ "aten/src/ATen/NativeMetaFunctions.h", "aten/src/ATen/RegistrationDeclarations.h", 
"aten/src/ATen/VmapGeneratedPlumbing.h", + "aten/src/ATen/ViewMetaClasses.h", + "aten/src/ATen/ViewMetaClasses.cpp", "aten/src/ATen/core/aten_interned_strings.h", "aten/src/ATen/core/enum_tag.h", "aten/src/ATen/core/TensorBody.h", @@ -747,6 +749,7 @@ cc_library( "torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemory.cu", "torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryOps.cu", "torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryUtils.cpp", + "torch/csrc/distributed/c10d/symm_mem/cuda_mem_pool.cpp", "torch/csrc/distributed/c10d/symm_mem/intra_node_comm.cu", ], )) + torch_sources, @@ -1105,6 +1108,7 @@ test_suite( "aten/src/ATen/templates/LazyNonNativeIr.h", "aten/src/ATen/templates/RegisterDispatchKey.cpp", "aten/src/ATen/templates/RegisterDispatchDefinitions.ini", + "aten/src/ATen/templates/ViewMetaClassesPythonBinding.cpp", "aten/src/ATen/native/native_functions.yaml", "aten/src/ATen/native/tags.yaml", "aten/src/ATen/native/ts_native_functions.yaml", diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 000000000000..dcdf409e7314 --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,15 @@ +# Testing + +Use our test class and test runner: + +``` +from torch.testing._internal.common_utils import run_tests, TestCase + +class TestFeature(TestCase): + ... + +if __name__ == "__main__": + run_tests() +``` + +To test Tensor equality, use assertEqual. diff --git a/CMakeLists.txt b/CMakeLists.txt index ad7368e19298..ce7890f002d3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -272,7 +272,7 @@ cmake_dependent_option(USE_STATIC_NCCL "Use static NCCL" OFF "USE_NCCL" OFF) cmake_dependent_option(USE_SYSTEM_NCCL "Use system-wide NCCL" OFF "USE_NCCL" OFF) cmake_dependent_option(USE_NVSHMEM "Use NVSHMEM" ON - "USE_CUDA OR USE_ROCM;UNIX;NOT APPLE" OFF) + "USE_DISTRIBUTED;USE_CUDA OR USE_ROCM;UNIX;NOT APPLE" OFF) option(USE_NNAPI "Use NNAPI" OFF) option(USE_NNPACK "Use NNPACK" ON) cmake_dependent_option(USE_NUMA "Use NUMA. Only available on Linux." ON "LINUX" @@ -880,10 +880,21 @@ cmake_dependent_option( USE_FBGEMM_GENAI "Whether to build FBGEMM GenAI quantized GEMM kernels.\ Will be disabled if not supported by the platform" - OFF - "USE_CUDA OR USE_ROCM" + ON + "USE_ROCM" OFF) +IF(USE_FBGEMM_GENAI AND USE_ROCM AND NOT "gfx942" IN_LIST PYTORCH_ROCM_ARCH) + message(WARNING "Unsupported ROCM arch for FBGEMM GenAI, will set USE_FBGEMM_GENAI to OFF") + set(USE_FBGEMM_GENAI off) +endif() + +# Set USE_FBGEMM_GENAI to ON for CUDA build on SM100 +if(USE_CUDA AND "$ENV{TORCH_CUDA_ARCH_LIST}" MATCHES "10.0a") + message(WARNING "Setting USE_FBGEMM_GENAI to ON for CUDA build on SM100") + set(USE_FBGEMM_GENAI ON) +endif() + # CAVEAT: Again, Flash Attention2 will error while building for sm52 while Mem # Eff Attention won't cmake_dependent_option( diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index dade8f4ec6ec..9d2b5d355391 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -88,13 +88,13 @@ source venv/bin/activate # or `& .\venv\Scripts\Activate.ps1` on Windows * If you want to have no-op incremental rebuilds (which are fast), see [Make no-op build fast](#make-no-op-build-fast) below. -* When installing with `python -m pip install -e .` (in contrast to `python -m pip install .`) Python runtime will use +* When installing with `python -m pip install -e . -v --no-build-isolation` (in contrast to `python -m pip install . -v --no-build-isolation`) Python runtime will use the current local source-tree when importing `torch` package. 
(This is done by creating [`.egg-link`](https://wiki.python.org/moin/PythonPackagingTerminology#egg-link) file in `site-packages` folder) This way you do not need to repeatedly install after modifying Python files (`.py`). However, you would need to reinstall if you modify Python interface (`.pyi`, `.pyi.in`) or non-Python files (`.cpp`, `.cc`, `.cu`, `.h`, ...). - One way to avoid running `python -m pip install -e .` every time one makes a change to C++/CUDA/ObjectiveC files on Linux/Mac, + One way to avoid running `python -m pip install -e . -v --no-build-isolation` every time one makes a change to C++/CUDA/ObjectiveC files on Linux/Mac, is to create a symbolic link from `build` folder to `torch/lib`, for example, by issuing following: ```bash pushd torch/lib; sh -c "ln -sf ../../build/lib/libtorch_cpu.* ."; popd @@ -116,7 +116,7 @@ source venv/bin/activate # or `& .\venv\Scripts\Activate.ps1` on Windows Next run `python setup.py clean`. After that, you can install in editable mode again. -* If you run into errors when running `python -m pip install -e .`, here are some debugging steps: +* If you run into errors when running `python -m pip install -e . -v --no-build-isolation`, here are some debugging steps: 1. Run `printf '#include \nint main() { printf("Hello World");}'|clang -x c -; ./a.out` to make sure your CMake works and can compile this simple Hello World program without errors. 2. Nuke your `build` directory. The `setup.py` script compiles binaries into the `build` folder and caches many @@ -129,10 +129,10 @@ source venv/bin/activate # or `& .\venv\Scripts\Activate.ps1` on Windows git clean -xdf python setup.py clean git submodule update --init --recursive - python -m pip install -r requirements.txt + python -m pip install --group dev python -m pip install --no-build-isolation -v -e . ``` - 4. The main step within `python -m pip install -e .` is running `cmake --build build` from the `build` directory. If you want to + 4. The main step within `python -m pip install -e . -v --no-build-isolation` is running `make` from the `build` directory. If you want to experiment with some environment variables, you can pass them into the command: ```bash ENV_KEY1=ENV_VAL1[, ENV_KEY2=ENV_VAL2]* CMAKE_FRESH=1 python -m pip install --no-build-isolation -v -e . @@ -259,6 +259,7 @@ dependencies as well as the nightly binaries into the repo directory. support for PyTorch. * [tools](tools) - Code generation scripts for the PyTorch library. See [README](tools/README.md) of this directory for more details. +* [torchgen](torchgen) - contains the logic and tooling for generating PyTorch's low-level C++ and Python bindings from operator definitions, typically specified in native_functions.yaml * [test](test) - Python unit tests for PyTorch Python frontend. * [test_torch.py](test/test_torch.py) - Basic tests for PyTorch functionality. @@ -294,7 +295,7 @@ The following packages should be installed with `pip`: - `pytest` - recommended to run tests more selectively Running ``` -pip install -r requirements.txt +pip install --group dev ``` will install these dependencies for you. @@ -645,9 +646,9 @@ can be selected interactively with your mouse to zoom in on a particular part of the program execution timeline. The `--native` command-line option tells `py-spy` to record stack frame entries for PyTorch C++ code. To get line numbers for C++ code it may be necessary to compile PyTorch in debug mode by prepending -your `python -m pip install -e .` call to compile PyTorch with `DEBUG=1`. 
-Depending on your operating system it may also be necessary to run `py-spy` with -root privileges. +your `python -m pip install -e . -v --no-build-isolation` call to compile +PyTorch with `DEBUG=1`. Depending on your operating system it may also be +necessary to run `py-spy` with root privileges. `py-spy` can also work in an `htop`-like "live profiling" mode and can be tweaked to adjust the stack sampling rate, see the `py-spy` readme for more @@ -655,10 +656,10 @@ details. ## Managing multiple build trees -One downside to using `python -m pip install -e .` is that your development -version of PyTorch will be installed globally on your account (e.g., if -you run `import torch` anywhere else, the development version will be -used). +One downside to using `python -m pip install -e . -v --no-build-isolation` is +that your development version of PyTorch will be installed globally on your +account (e.g., if you run `import torch` anywhere else, the development version +will be used). If you want to manage multiple builds of PyTorch, you can make use of [venv environments](https://docs.python.org/3/library/venv.html) to maintain @@ -719,7 +720,7 @@ options. ### Code completion and IDE support -When using `python -m pip install -e .`, PyTorch will generate +When using `python -m pip install -e . -v --no-build-isolation`, PyTorch will generate a `compile_commands.json` file that can be used by many editors to provide command completion and error highlighting for PyTorch's C++ code. You need to `pip install ninja` to generate accurate diff --git a/Dockerfile b/Dockerfile index 7b8964bd860e..331cf00593cb 100644 --- a/Dockerfile +++ b/Dockerfile @@ -50,11 +50,10 @@ RUN git submodule update --init --recursive FROM conda as conda-installs ARG PYTHON_VERSION=3.11 ARG CUDA_PATH=cu121 -ARG CUDA_CHANNEL=nvidia ARG INSTALL_CHANNEL=whl/nightly # Automatically set by buildx -RUN /opt/conda/bin/conda update -y -n base -c defaults conda -RUN /opt/conda/bin/conda install -y python=${PYTHON_VERSION} +# pinning version of conda here see: https://github.com/pytorch/pytorch/issues/164574 +RUN /opt/conda/bin/conda install -y python=${PYTHON_VERSION} conda=25.7.0 ARG TARGETPLATFORM diff --git a/README.md b/README.md index 4c18724be0c0..99e6dabd1618 100644 --- a/README.md +++ b/README.md @@ -243,7 +243,7 @@ git submodule update --init --recursive ```bash # Run this command from the PyTorch directory after cloning the source code using the “Get the PyTorch Source“ section above -pip install -r requirements.txt +pip install --group dev ``` **On Linux** @@ -394,7 +394,7 @@ On macOS ```bash export CMAKE_PREFIX_PATH="${CONDA_PREFIX:-'$(dirname $(which conda))/../'}:${CMAKE_PREFIX_PATH}" -MACOSX_DEPLOYMENT_TARGET=10.9 CC=clang CXX=clang++ CMAKE_ONLY=1 python setup.py build +MACOSX_DEPLOYMENT_TARGET=11.0 CMAKE_ONLY=1 python setup.py build ccmake build # or cmake-gui build ``` diff --git a/aten/src/ATen/CMakeLists.txt b/aten/src/ATen/CMakeLists.txt index d8787154a213..38b383c2bb31 100644 --- a/aten/src/ATen/CMakeLists.txt +++ b/aten/src/ATen/CMakeLists.txt @@ -1,5 +1,15 @@ cmake_minimum_required(VERSION 3.27 FATAL_ERROR) set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake ${CMAKE_MODULE_PATH}) +list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake/public") +if(USE_ROCM) + include(LoadHIP OPTIONAL RESULT_VARIABLE _had_loadhip) + if(_had_loadhip) + # Exposed by LoadHIP.cmake, e.g. 
"7.1.2" or "7.2.0" + message(STATUS "LoadHIP loaded: ROCM_VERSION_DEV='${ROCM_VERSION_DEV}'") + else() + message(WARNING "LoadHIP.cmake not found; ROCM_VERSION_DEV unavailable") + endif() +endif() if(NOT MSVC) string(APPEND CMAKE_CXX_FLAGS " -Wno-ignored-qualifiers") @@ -216,7 +226,7 @@ file(GLOB mem_eff_attention_cuda_cpp "native/transformers/cuda/mem_eff_attention if(USE_CUDA AND (USE_FLASH_ATTENTION OR USE_MEM_EFF_ATTENTION)) add_library(flash_attention OBJECT EXCLUDE_FROM_ALL ${flash_attention_cuda_kernels_cu} ${flash_attention_cuda_cpp}) - target_include_directories(flash_attention PUBLIC + target_include_directories(flash_attention SYSTEM PUBLIC ${PROJECT_SOURCE_DIR}/third_party/flash-attention/csrc ${PROJECT_SOURCE_DIR}/third_party/flash-attention/include ${PROJECT_SOURCE_DIR}/third_party/cutlass/include @@ -252,47 +262,81 @@ if(USE_MEM_EFF_ATTENTION) list(APPEND ATen_ATTENTION_KERNEL_SRCS ${mem_eff_attention_cuda_kernels_cu}) endif() -IF(USE_FBGEMM_GENAI AND USE_ROCM AND NOT "gfx942" IN_LIST PYTORCH_ROCM_ARCH) - message(WARNING "Unsupported ROCM arch for FBGEMM GenAI, will set USE_FBGEMM_GENAI to OFF") - set(USE_FBGEMM_GENAI off) -endif() - # FBGEMM GenAI IF(USE_FBGEMM_GENAI) set(FBGEMM_THIRD_PARTY ${PROJECT_SOURCE_DIR}/third_party/fbgemm/external/) - set(FBGEMM_GENAI_DIR ${PROJECT_SOURCE_DIR}/third_party/fbgemm/fbgemm_gpu/experimental/gen_ai/src/quantize) - - if(USE_ROCM) - # Only include the kernels we want to build to avoid increasing binary size. - file(GLOB_RECURSE fbgemm_genai_native_rocm_hip - "${FBGEMM_GENAI_DIR}/ck_extensions/fp8_rowwise_grouped/kernels/fp8_rowwise_grouped*.hip" - "${FBGEMM_GENAI_DIR}/ck_extensions/fp8_rowwise_grouped/fp8_rowwise_grouped_gemm.hip") - set_source_files_properties(${fbgemm_genai_native_rocm_hip} PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1) - - # Add additional HIPCC compiler flags for performance - set(FBGEMM_GENAI_EXTRA_HIPCC_FLAGS - -mllvm - -amdgpu-coerce-illegal-types=1 - -mllvm - -enable-post-misched=0 - -mllvm - -greedy-reverse-local-assignment=1 - -fhip-new-launch-api) - - hip_add_library( - fbgemm_genai STATIC - ${fbgemm_genai_native_rocm_hip} - HIPCC_OPTIONS ${HIP_HCC_FLAGS} ${FBGEMM_GENAI_EXTRA_HIPCC_FLAGS}) + set(FBGEMM_GENAI_SRCS ${PROJECT_SOURCE_DIR}/third_party/fbgemm/fbgemm_gpu/experimental/gen_ai/src/quantize) + if(USE_CUDA) + # To avoid increasing the build time/binary size unnecessarily, use an allow-list of kernels to build. + # If you want to integrate a kernel from FBGEMM into torch, you have to add it here. 
+ set(FBGEMM_CUTLASS_KERNELS_REGEX ".*mx8mx8bf16_grouped.*") + file(GLOB_RECURSE fbgemm_genai_native_cuda_cu + "${FBGEMM_GENAI_SRCS}/cutlass_extensions/*.cu" + "${FBGEMM_GENAI_SRCS}/cutlass_extensions/**/*.cu") + list(FILTER fbgemm_genai_native_cuda_cu INCLUDE REGEX ${FBGEMM_CUTLASS_KERNELS_REGEX}) + + file(GLOB_RECURSE fbgemm_genai_native_cuda_cpp + "${FBGEMM_GENAI_SRCS}/common/*.cpp" + ) + + # Combine all source files into a single list + list(APPEND fbgemm_genai_all_sources + ${fbgemm_genai_native_cuda_cu} + ${fbgemm_genai_native_cuda_cpp} + ) + + # Now, create the library and provide the sources at the same time + add_library(fbgemm_genai OBJECT ${fbgemm_genai_all_sources}) + set_target_properties(fbgemm_genai PROPERTIES POSITION_INDEPENDENT_CODE ON) - target_compile_definitions(fbgemm_genai PRIVATE FBGEMM_GENAI_NO_EXTENDED_SHAPES) + + set(fbgemm_genai_mx8mx8bf16_grouped + "${FBGEMM_GENAI_SRCS}/cutlass_extensions/mx8mx8bf16_grouped/" + ) target_include_directories(fbgemm_genai PUBLIC - # FBGEMM version of Composable Kernel is used due to some customizations - ${FBGEMM_THIRD_PARTY}/composable_kernel/include - ${FBGEMM_THIRD_PARTY}/composable_kernel/library/include - ${FBGEMM_GENAI_DIR}/include/ - ${FBGEMM_GENAI_DIR}/common/include/ + ${FBGEMM_THIRD_PARTY}/cutlass/include + ${FBGEMM_THIRD_PARTY}/cutlass/tools/util/include + ${fbgemm_genai_mx8mx8bf16_grouped} + ${FBGEMM_GENAI_SRCS}/common/include/ # includes fbgemm_gpu/quantize/utils.h, fbgemm_gpu/quantize/tuning_cache.hpp + ${FBGEMM_GENAI_SRCS}/include/ # includes fbgemm_gpu/torch_ops.h ) + else() + if(USE_ROCM) + # Only include the kernels we want to build to avoid increasing binary size. + file(GLOB_RECURSE fbgemm_genai_native_rocm_hip + "${FBGEMM_GENAI_SRCS}/ck_extensions/fp8_rowwise_grouped/kernels/fp8_rowwise_grouped*.hip" + "${FBGEMM_GENAI_SRCS}/ck_extensions/fp8_rowwise_grouped/fp8_rowwise_grouped_gemm.hip") + set_source_files_properties(${fbgemm_genai_native_rocm_hip} PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1) + + # Add additional HIPCC compiler flags for performance + set(FBGEMM_GENAI_EXTRA_HIPCC_FLAGS + -mllvm + -enable-post-misched=0 + -mllvm + -greedy-reverse-local-assignment=1 + -fhip-new-launch-api) + if(DEFINED ROCM_VERSION_DEV AND ROCM_VERSION_DEV VERSION_LESS "7.2.0") + list(PREPEND FBGEMM_GENAI_EXTRA_HIPCC_FLAGS -mllvm -amdgpu-coerce-illegal-types=1) + endif() + + hip_add_library( + fbgemm_genai STATIC + ${fbgemm_genai_native_rocm_hip} + HIPCC_OPTIONS ${HIP_HCC_FLAGS} ${FBGEMM_GENAI_EXTRA_HIPCC_FLAGS}) + set_target_properties(fbgemm_genai PROPERTIES POSITION_INDEPENDENT_CODE ON) + target_compile_definitions(fbgemm_genai PRIVATE FBGEMM_GENAI_NO_EXTENDED_SHAPES) + + target_include_directories(fbgemm_genai PUBLIC + # FBGEMM version of Composable Kernel is used due to some customizations + ${FBGEMM_THIRD_PARTY}/composable_kernel/include + ${FBGEMM_THIRD_PARTY}/composable_kernel/library/include + ${FBGEMM_THIRD_PARTY}/cutlass/include + ${FBGEMM_THIRD_PARTY}/cutlass/tools/util/include + ${FBGEMM_GENAI_SRCS}/common/include/ # includes fbgemm_gpu/quantize/utils.h, fbgemm_gpu/quantize/tuning_cache.hpp + ${FBGEMM_GENAI_SRCS}/include/ # includes fbgemm_gpu/torch_ops.h + ) + endif() endif() endif() @@ -635,12 +679,26 @@ if(USE_CUDA AND NOT USE_ROCM) add_definitions(-DCUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) list(APPEND ATen_CUDA_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/cutlass/include) list(APPEND ATen_CUDA_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/cutlass/tools/util/include) + + # Add FBGEMM_GENAI 
include directories for torch_ops.h + if(USE_FBGEMM_GENAI) + list(APPEND ATen_CUDA_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/fbgemm/fbgemm_gpu/experimental/gen_ai/src/quantize/include) + list(APPEND ATen_CUDA_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/fbgemm/fbgemm_gpu/experimental/gen_ai/src/quantize/common/include) + endif() + if($ENV{ATEN_STATIC_CUDA}) - list(APPEND ATen_CUDA_DEPENDENCY_LIBS - ${CUDA_LIBRARIES} - CUDA::cusparse_static - CUDA::cufft_static_nocallback - ) + if(CUDA_VERSION VERSION_LESS_EQUAL 12.9) + list(APPEND ATen_CUDA_DEPENDENCY_LIBS + ${CUDA_LIBRARIES} + CUDA::cusparse_static + CUDA::cufft_static_nocallback) + else() + list(APPEND ATen_CUDA_DEPENDENCY_LIBS + ${CUDA_LIBRARIES} + CUDA::cusparse_static + CUDA::cufft_static) + endif() + if(NOT BUILD_LAZY_CUDA_LINALG) list(APPEND ATen_CUDA_DEPENDENCY_LIBS CUDA::cusolver_static diff --git a/aten/src/ATen/DLConvertor.cpp b/aten/src/ATen/DLConvertor.cpp index 9632cd5ed698..98ad757946be 100644 --- a/aten/src/ATen/DLConvertor.cpp +++ b/aten/src/ATen/DLConvertor.cpp @@ -308,17 +308,44 @@ void fillVersion( // constructed out of ATen tensor template T* toDLPackImpl(const Tensor& src) { - // create a new tensor with possibly normalized strides - // gh-83069 - auto shape = src.sizes(); - auto strides = src.strides().vec(); - for (int i = 0; i < src.dim(); i++) { - if (shape[i] < 2) { - strides[i] = 1; + auto view = src; + + // Detect whether there is need to normalize the strides + // Background: gh-83069 + // + // However, normalizing strides can come at a high-cost + // to slow down toDLPack conversion 3x, so we + // only normalize if needed. + // + // The following code detects whether the src follows + // a continuous pattern. If the src follows such pattern (common-case) + // then we do not need to normalize the strides. 
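// Editor's illustration (not part of this patch), clarifying when the slow path
// described above is taken. Only dimensions of size 1 can carry an "arbitrary"
// stride that breaks the contiguous pattern without changing which bytes are
// addressed, e.g.:
//
//   shape {2, 3}, strides {3, 1}   -> matches the contiguous pattern, fast path
//   shape {1, 3}, strides {99, 1}  -> size-1 dim with stride 99 != expected 3,
//                                     stride rewritten to 1 via as_strided (gh-83069)
//
// A genuinely non-contiguous tensor (mismatch on a dim of size >= 2, e.g. a
// transpose) keeps its real strides and is exported unchanged, as before.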
+ bool need_normalize_strides = false; + int64_t expected_stride = 1; + for (int i = src.dim() - 1; i >= 0; i--) { + // detect if we do not meet continuous pattern + // and the size is 1, so there is opportunity to normalize + if (src.stride(i) != expected_stride && src.size(i) == 1) { + need_normalize_strides = true; + break; + } + expected_stride *= src.size(i); + } + + // less common case, try normalizing the strides + if (need_normalize_strides) { + // create a new tensor with possibly normalized strides + // gh-83069 + auto shape = src.sizes(); + auto strides = src.strides().vec(); + for (int i = 0; i < src.dim(); i++) { + if (shape[i] < 2) { + strides[i] = 1; + } } + view = src.as_strided(shape, strides, src.storage_offset()); } - auto view = src.as_strided(shape, strides, src.storage_offset()); ATenDLMTensor* atDLMTensor(new ATenDLMTensor); atDLMTensor->handle = view; atDLMTensor->tensor.manager_ctx = atDLMTensor; diff --git a/aten/src/ATen/DTensorState.cpp b/aten/src/ATen/DTensorState.cpp new file mode 100644 index 000000000000..0644aae3d070 --- /dev/null +++ b/aten/src/ATen/DTensorState.cpp @@ -0,0 +1,17 @@ +#include + +namespace at { + +namespace { +thread_local bool kDTensorAllowImplicitReplication = false; +} + +bool get_dtensor_allow_implicit_replication() { + return kDTensorAllowImplicitReplication; +} + +void set_dtensor_allow_implicit_replication(bool enabled) { + kDTensorAllowImplicitReplication = enabled; +} + +} // namespace at diff --git a/aten/src/ATen/DTensorState.h b/aten/src/ATen/DTensorState.h new file mode 100644 index 000000000000..07e89eaeddae --- /dev/null +++ b/aten/src/ATen/DTensorState.h @@ -0,0 +1,34 @@ +#pragma once + +#include + +namespace at { + +TORCH_API bool get_dtensor_allow_implicit_replication(); +TORCH_API void set_dtensor_allow_implicit_replication(bool enabled); + +struct DTensorAllowImplicitReplication { + DTensorAllowImplicitReplication() + : prev_dtensor_allow_implicit_replication_( + get_dtensor_allow_implicit_replication()) { + set_dtensor_allow_implicit_replication(true); + } + + DTensorAllowImplicitReplication(const DTensorAllowImplicitReplication&) = + delete; + DTensorAllowImplicitReplication& operator=( + const DTensorAllowImplicitReplication&) = delete; + DTensorAllowImplicitReplication(DTensorAllowImplicitReplication&&) = delete; + DTensorAllowImplicitReplication& operator=( + DTensorAllowImplicitReplication&&) = delete; + + ~DTensorAllowImplicitReplication() { + set_dtensor_allow_implicit_replication( + prev_dtensor_allow_implicit_replication_); + } + + private: + bool prev_dtensor_allow_implicit_replication_; +}; + +} // namespace at diff --git a/aten/src/ATen/FunctionalStorageImpl.cpp b/aten/src/ATen/FunctionalStorageImpl.cpp index a5512818343f..8bca495abdc6 100644 --- a/aten/src/ATen/FunctionalStorageImpl.cpp +++ b/aten/src/ATen/FunctionalStorageImpl.cpp @@ -9,11 +9,6 @@ namespace at::functionalization { -ViewMeta ViewMeta::to_out_idx(int64_t out_idx) { - if (out_idx == this->out_index) return *this; - return ViewMeta(forward_fn, reverse_fn, has_symbolic_inputs, is_multi_output, is_as_strided, out_idx); -} - // Note [Functionalization: Alias Removal Part 2] // See Note [Functionalization: Alias Removal] for more details. // This function applies a single update from one of the views to the StorageImpl. 
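Side note on the new `aten/src/ATen/DTensorState.{h,cpp}` files above: the guard saves the current thread-local flag, sets it to true, and restores the previous value on destruction. A minimal usage sketch (editor's example, not part of the patch; it assumes only the declarations shown in the new header):

```cpp
#include <ATen/DTensorState.h>
#include <cassert>

void with_implicit_replication() {
  assert(!at::get_dtensor_allow_implicit_replication());
  {
    // Flips the thread-local flag for the lifetime of this scope.
    at::DTensorAllowImplicitReplication guard;
    assert(at::get_dtensor_allow_implicit_replication());
    // ... run DTensor ops that may treat plain tensors as implicitly replicated ...
  }
  // The previous value is restored when the guard is destroyed, so nested
  // guards compose correctly and the flag never leaks across scopes.
  assert(!at::get_dtensor_allow_implicit_replication());
}
```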
@@ -42,12 +37,12 @@ ViewMeta ViewMeta::to_out_idx(int64_t out_idx) { static const Tensor apply_update(const FunctionalStorageImpl::Update& update, const Tensor& base) { at::Tensor t = update.new_val; TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(t)); - if (update.view_metas.empty()) return t; + if (update.view_metas.empty()) { return t; } std::vector tmp_values({base}); tmp_values.reserve(update.view_metas.size()); for (size_t i = 0; i < update.view_metas.size() - 1; ++i) { - at::Tensor next_view = update.view_metas[i].forward_fn(tmp_values.back(), update.view_metas[i].out_index); + at::Tensor next_view = update.view_metas[i]->forward(tmp_values.back()); // NB: We only actually need tmp_values for ops like select/slice/diagonal/squeeze/as_strided // All of these ops require additional information to recover the sizes of the original tensor. // If need to, we could probably apply this optimization and only bother computing tmp_values @@ -55,9 +50,8 @@ static const Tensor apply_update(const FunctionalStorageImpl::Update& update, co tmp_values.push_back(std::move(next_view)); } for(int64_t i = static_cast(update.view_metas.size()) - 1; i >= 0; --i) { - int64_t out_idx = update.view_metas[i].out_index; // Each view inverse is implemented in ViewInverses.cpp. - t = update.view_metas[i].reverse_fn(tmp_values[i], t, out_idx); + t = update.view_metas[i]->reverse(tmp_values[i], t); } TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(t)); return t; @@ -111,13 +105,13 @@ FunctionalStorageImpl::FunctionalStorageImpl(const Tensor& base) TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(base_)); } -void FunctionalStorageImpl::add_update(const Tensor& updated_val, const std::vector& metas) { +void FunctionalStorageImpl::add_update(const Tensor& updated_val, const std::vector>& metas) { TORCH_CHECK(!frozen_, "cannot mutate tensors with frozen storage"); if (metas.size() > 1) { for (size_t i = 1; i < metas.size(); ++i) { // Skipping this check for XLA. Would be good to add it back, but it is failing XLA CI - TORCH_CHECK(updated_val.device().type() == c10::DeviceType::XLA || !metas[i].is_as_strided, + TORCH_CHECK(updated_val.device().type() == c10::DeviceType::XLA || !metas[i]->is_as_strided, "During torch.compile, encountered a mutation on a view chain of length ", metas.size(), ", where view ", i, " was an as_strided() call. as_strided() is non-compositional, and therefore is not possible to functionalize properly today," "so this behavior is banned in compile. As a workaround, you can either remove the mutation from the model code, or you " diff --git a/aten/src/ATen/FunctionalStorageImpl.h b/aten/src/ATen/FunctionalStorageImpl.h index 8cd1cb7434aa..0c9c1fd775f3 100644 --- a/aten/src/ATen/FunctionalStorageImpl.h +++ b/aten/src/ATen/FunctionalStorageImpl.h @@ -8,44 +8,89 @@ namespace at::functionalization { // See Note [Functionalization Pass In Core] +enum class InverseReturnMode { + /// Specifies that functional inverses should always return a view. + AlwaysView, + /// Specifies that functional inverses should always return a non-view / copy. + NeverView, + /// Specifies that functional inverses should return a view unless a (copying) + /// scatter + /// inverse exists, in which case that will be used instead. + /// This avoids as_strided() calls that can be difficult for subclasses to + /// handle. 
+ ViewOrScatterInverse, +}; + +#define FUNCTIONALIZATION_VIEWMETA_NAME(TYPE) \ + static const char* name() { \ + return #TYPE; \ + } + +#define FUNCTIONALIZATION_VIEWMETA_SERIALIZABLE_TUPLE(...) \ + using SerializableTuple = std::tuple<__VA_ARGS__> + // ViewMeta is a class used by the functionalization pass to navigate between // a base tensor and a view tensor. // For example, if I call `b = a.view1(...)` -// the functionalization pass will generate and store a ViewMeta on b that looks -// like: +// the functionalization pass will generate and store a ViewMeta specialization +// for `view1` operation on b that looks like: // -// ViewMeta( -// [](const Tensor& base, int64_t mutated_view_idx) { -// return base.view1(...); -// }, -// [](const at::Tensor& base, const at::Tensor& mutated_view, -// int64_t mutated_view_idx) -> at::Tensor { -// return at::functionalization::impl::view1_inverse(base, mutated_view, -// ...); +// struct TORCH_API view1_ViewMeta : public ViewMeta { +// FUNCTIONALIZATION_VIEWMETA_NAME(view1_ViewMeta); +// FUNCTIONALIZATION_VIEWMETA_SERIALIZABLE_TUPLE( +// bool /* reapply_views */, +// const std::vector&); +// +// view1_ViewMeta(const SerializableTuple& tpl) +// : view1_ViewMeta(std::get<0>(tpl), std::get<1>(tpl)) {} +// +// view1_ViewMeta(bool reapply_views, const std::vector& size) +// : ViewMeta(/*has_symbolic_inputs=*/false), +// reapply_views(reapply_views), +// size(size) {} +// +// Tensor forward(const Tensor& base) override { +// return base.view1(...); // } // -// The forward_fn lambda describes how to replay view1 on a tensor. +// Tensor reverse(const Tensor& base, const Tensor& mutated_view) override { +// return at::functionalization::impl::view1_inverse(base, mutated_view, +// ...); +// } // -// The reverse_fn lambda describes how, given a tensor that is already a view, +// SerializableTuple to_serializable_tuple() { +// return std::make_tuple(reapply_views, size); +// } +// +// bool reapply_views; +// std::vector size; +// }; +// +// The forward function describes how to replay view1 on a tensor. +// +// The reverse function describes how, given a tensor that is already a view, // how to get the corresponding base tensor. See Note [Functionalization Pass: // View Inverses] for details. +// +// `SerializedTuple` is a typedef that defines an `std::tuple<...>` type +// representing the `ViewMeta` instance state. Methods that take in/return such +// a type are used for supporting pickle serialization. struct ViewMeta { ViewMeta( - std::function forward, - std::function reverse, bool has_symbolic_inputs, bool is_multi_output = false, bool is_as_strided = false, int64_t out_idx = 0) - : forward_fn(std::move(forward)), - reverse_fn(std::move(reverse)), - out_index(out_idx), + : out_index(out_idx), is_multi_output(is_multi_output), is_as_strided(is_as_strided), has_symbolic_inputs(has_symbolic_inputs) {} - std::function forward_fn; - std::function reverse_fn; + virtual ~ViewMeta() = default; + + virtual Tensor forward(const Tensor& base) = 0; + virtual Tensor reverse(const Tensor& base, const Tensor& mutated_view) = 0; + // See Note [out_idx in ViewMeta] int64_t out_index; @@ -57,10 +102,17 @@ struct ViewMeta { // Tells us if this view operation has any symbolic inputs bool has_symbolic_inputs; - // Returns a copy of the current ViewMeta, if out_idx matches the current - // out_index. Otherwise, returns a new ViewMeta with the same forward/reverse + // Returns a new ViewMeta with the same forward/reverse // functions, but a new out index. 
- ViewMeta to_out_idx(int64_t out_idx); + // + // This method should be implemented by those `ViewMeta` that have more than + // one output. + virtual std::shared_ptr to_out_index(int64_t out_index) { + TORCH_CHECK_NOT_IMPLEMENTED( + false, + "ViewMeta::to_out_index not implemented. ", + "Likely because there's only one output."); + } }; // FunctionalStorageImpl is a subclass of StorageImpl used by the @@ -93,14 +145,14 @@ struct TORCH_API FunctionalStorageImpl : public c10::StorageImpl { // NOLINTNEXTLINE(cppcoreguidelines-avoid-const-or-ref-data-members) const at::Tensor new_val; // NOLINTNEXTLINE(cppcoreguidelines-avoid-const-or-ref-data-members) - const std::vector view_metas; + const std::vector> view_metas; }; explicit FunctionalStorageImpl(const Tensor& value); void add_update( const Tensor& updated_val, - const std::vector& view_metas); + const std::vector>& view_metas); bool apply_updates(); const Tensor& base() { return base_; diff --git a/aten/src/ATen/FunctionalTensorWrapper.cpp b/aten/src/ATen/FunctionalTensorWrapper.cpp index 7d5e4e84e861..3a574fa7d491 100644 --- a/aten/src/ATen/FunctionalTensorWrapper.cpp +++ b/aten/src/ATen/FunctionalTensorWrapper.cpp @@ -129,17 +129,19 @@ void FunctionalTensorWrapper::freeze_storage() const { // - view_value: The output tensor that we need to wrap. // - base: The "base" of the view that `view_value` was generated from. // See Note [Functionalization: Alias Removal Part 2] for more details on the mutation replay logic. -FunctionalTensorWrapper::FunctionalTensorWrapper(const Tensor& view_value, const FunctionalTensorWrapper* base, const functionalization::ViewMeta& meta) - : c10::TensorImpl( - c10::DispatchKeySet(DispatchKey::Functionalize), - view_value.dtype(), - view_value.device() - ), - value_(view_value), - is_multi_output_view_(base->is_multi_output_view_ || meta.is_multi_output), - was_storage_changed_(base->was_storage_changed_), - is_symbolic_(base->is_symbolic_) -{ +FunctionalTensorWrapper::FunctionalTensorWrapper( + const Tensor& view_value, + const FunctionalTensorWrapper* base, + const std::shared_ptr& meta) + : c10::TensorImpl( + c10::DispatchKeySet(DispatchKey::Functionalize), + view_value.dtype(), + view_value.device()), + value_(view_value), + is_multi_output_view_( + base->is_multi_output_view_ || meta->is_multi_output), + was_storage_changed_(base->was_storage_changed_), + is_symbolic_(base->is_symbolic_) { TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(value_)); TORCH_INTERNAL_ASSERT(!value_.key_set().has(c10::DispatchKey::Functionalize)); set_constructor_metadata(); @@ -148,11 +150,10 @@ FunctionalTensorWrapper::FunctionalTensorWrapper(const Tensor& view_value, const view_metas_ = base->view_metas_; // copy } view_metas_.push_back(meta); - maybe_mark_symbolic(meta); + maybe_mark_symbolic(meta.get()); storage_ = base->storage_; // alias this tensor's storage with the base tensor's } - functionalization::FunctionalStorageImpl* FunctionalTensorWrapper::functional_storage_impl() const { return static_cast(storage_.unsafeGetStorageImpl()); } @@ -176,18 +177,18 @@ bool FunctionalTensorWrapper::is_up_to_date() const { } // See Note [Functionalization Pass - Inplace View Ops] -void FunctionalTensorWrapper::mutate_view_meta(const at::functionalization::ViewMeta& meta) { +void FunctionalTensorWrapper::mutate_view_meta(const std::shared_ptr& meta) { view_metas_.push_back(meta); // Manually track the fact that this tensor received a metadata mutation! 
has_metadata_mutation_ = true; // Mark this tensor as being symbolic if there are any symbolic inputs used by the view operation. - maybe_mark_symbolic(meta); + maybe_mark_symbolic(meta.get()); // Note [Functionalization Pass - Inplace View Ops] // So, these ops are special - they're mutation AND view ops. They get special codegen. // An example is transpose_, e.g. `a.transpose_()` // Calling transpose_() should ensure that a gets an alias, and append the new ViewMeta to a's current list of ViewMetas. at::AutoDispatchSkipFunctionalize guard; - value_ = meta.forward_fn(value_, meta.out_index); + value_ = meta->forward(value_); TORCH_INTERNAL_ASSERT(!value_.key_set().has(c10::DispatchKey::Functionalize)); } @@ -368,15 +369,8 @@ void FunctionalTensorWrapper::sync_() { regenerate_from_base(); } -Tensor FunctionalTensorWrapper::apply_view_metas(const Tensor& base) { - auto t = base; - - // Reapply views to get the viewed tensor from the base in alias_ - for (auto& view_meta: view_metas_) { - t = view_meta.forward_fn(t, view_meta.out_index); - } - - return t; +const std::vector>& FunctionalTensorWrapper::view_metas() const { + return view_metas_; } void FunctionalTensorWrapper::regenerate_from_base() { @@ -385,7 +379,7 @@ void FunctionalTensorWrapper::regenerate_from_base() { auto t = storage_impl->base(); TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(t)); - t = apply_view_metas(t); + t = at::functionalization::impl::apply_view_meta_sequence(t, view_metas_); TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(t)); replace_(t, /*from_lazy_regenerate=*/true); @@ -724,11 +718,11 @@ bool isFunctionalTensor(const std::optional& t) { } bool isFunctionalTensor(const c10::List<::std::optional>& t_list) { - if (t_list.empty()) return false; + if (t_list.empty()) { return false; } auto functional_count = 0; for (const auto i : c10::irange(t_list.size())) { auto const & e= t_list[i]; - if (!e.has_value() || !e->defined()) continue; + if (!e.has_value() || !e->defined()) { continue; } if (isFunctionalTensor(e)) { ++functional_count; } @@ -738,10 +732,10 @@ bool isFunctionalTensor(const c10::List<::std::optional>& t_list) { template static bool isFunctionalTensorIListRef(c10::IListRef list) { - if (list.size() == 0) return false; + if (list.size() == 0) { return false; } auto functional_count = 0; for (const auto& tensor : list) { - if (!tensor.defined()) continue; + if (!tensor.defined()) { continue; } if (isFunctionalTensor(tensor)) { ++functional_count; } @@ -759,20 +753,28 @@ void freeze_functional_tensor(const Tensor& tensor) { functional_base_impl->freeze_storage(); } -Tensor create_functional_tensor_with_view_meta(const at::Tensor& view_to_wrap, const at::Tensor& base, functionalization::ViewMeta meta, int64_t out_idx) { +Tensor create_functional_tensor_with_view_meta( + const at::Tensor& view_to_wrap, + const at::Tensor& base, + const std::shared_ptr& meta, + int64_t out_idx) { TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(view_to_wrap)); TORCH_INTERNAL_ASSERT(at::functionalization::impl::isFunctionalTensor(base)); auto functional_base_impl = at::functionalization::impl::unsafeGetFunctionalWrapper(base); + auto meta_ = meta; if (out_idx != 0) { // Note [out_idx in ViewMeta] // When a view op outputs multiple tensors, each output needs its own separate ViewMeta. // Each ViewMeta also tracks the index of the particular output tensor, which is needed in the reverse function. 
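  // Editor's note (illustration, not part of this patch): a concrete case for
  // Note [out_idx in ViewMeta] is a multi-output view such as
  //   auto chunks = base.split(2);   // chunks[0], chunks[1], ...
  // All outputs replay the same forward()/reverse() logic, but each wrapper
  // records its own out_index so that reverse() scatters the mutated view back
  // into the right piece of the base; to_out_index(out_idx) below produces the
  // per-output ViewMeta for every output other than the first.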
- meta = meta.to_out_idx(out_idx); + meta_ = meta->to_out_index(out_idx); } - return at::detail::make_tensor(view_to_wrap, functional_base_impl, meta); + return at::detail::make_tensor(view_to_wrap, functional_base_impl, meta_); } -std::vector create_functional_tensor_with_view_meta(ITensorListRef view_to_wrap, const at::Tensor& base, const functionalization::ViewMeta& meta) { +std::vector create_functional_tensor_with_view_meta( + ITensorListRef view_to_wrap, + const at::Tensor& base, + const std::shared_ptr& meta) { std::vector outputs(view_to_wrap.size()); int64_t i = 0; for (const auto& tensor : view_to_wrap) { @@ -782,12 +784,22 @@ std::vector create_functional_tensor_with_view_meta(ITensorListRef view_ return outputs; } -void mutate_view_meta(const at::Tensor& self, const functionalization::ViewMeta& meta) { +void mutate_view_meta(const at::Tensor& self, const std::shared_ptr& meta) { TORCH_INTERNAL_ASSERT(at::functionalization::impl::isFunctionalTensor(self)); auto self_impl = at::functionalization::impl::unsafeGetFunctionalWrapper(self); self_impl->mutate_view_meta(meta); } +Tensor apply_view_meta_sequence( + const Tensor& base, + const std::vector>& sequence) { + Tensor r = base; + for (auto& vm : sequence) { + r = vm->forward(r); + } + return r; +} + // Note [Propagating strides in the functionalization pass] // In order to properly compute stride information, the functionalization pass // calls each {view} reference implementations with meta tensors. @@ -881,7 +893,7 @@ void functionalize_op_helper(const c10::OperatorHandle& op, torch::jit::Stack* s const auto& ivalue = returns[idx]; if (ivalue.isTensor()) { const auto& t = ivalue.toTensor(); - if (!t.defined()) continue; + if (!t.defined()) { continue; } at::functionalization::impl::sync(t); auto t_new = c10::IValue(at::functionalization::impl::from_functional_tensor(t)); (*stack)[returns_begin + idx] = t_new; diff --git a/aten/src/ATen/FunctionalTensorWrapper.h b/aten/src/ATen/FunctionalTensorWrapper.h index b260b7c9f958..6d9050728da7 100644 --- a/aten/src/ATen/FunctionalTensorWrapper.h +++ b/aten/src/ATen/FunctionalTensorWrapper.h @@ -56,7 +56,7 @@ struct TORCH_API FunctionalTensorWrapper : public c10::TensorImpl { explicit FunctionalTensorWrapper( const Tensor& view_value, const FunctionalTensorWrapper* base, - const functionalization::ViewMeta& meta); + const std::shared_ptr& meta); // Get the underlying, actual tensor, that doesn't know anything about // functionalization. @@ -99,17 +99,17 @@ struct TORCH_API FunctionalTensorWrapper : public c10::TensorImpl { ->are_all_mutations_under_no_grad_or_inference_mode(); } - void maybe_mark_symbolic(const functionalization::ViewMeta& meta) { - is_symbolic_ = is_symbolic_ | meta.has_symbolic_inputs; + void maybe_mark_symbolic(functionalization::ViewMeta* meta) { + is_symbolic_ = is_symbolic_ | meta->has_symbolic_inputs; } bool is_symbolic() const { return is_symbolic_; } - // Runs the forward_fn of every ViewMeta collected in the current instance - // to some other base. - Tensor apply_view_metas(const Tensor& base); + // Retrieves the ViewMeta sequence of this tensor. + const std::vector>& view_metas() + const; // Sync's the underlying tensor with its alias, if it's out of date. This // involves two steps: 1) Apply any pending updates/mutations to the alias 2) @@ -146,7 +146,8 @@ struct TORCH_API FunctionalTensorWrapper : public c10::TensorImpl { // from the base tensor. This method is used by inplace-view ops like // transpose_. 
It appends a ViewMeta to the existing stack, and refreshes the // tensor by replaying the views off of the alias. - void mutate_view_meta(const at::functionalization::ViewMeta& meta); + void mutate_view_meta( + const std::shared_ptr& meta); // Custom implementation of self.set_(src) void set__impl(const FunctionalTensorWrapper* other); @@ -285,7 +286,7 @@ struct TORCH_API FunctionalTensorWrapper : public c10::TensorImpl { bool is_symbolic_ = false; size_t generation_ = 0; - std::vector view_metas_; + std::vector> view_metas_; protected: static void copy_tensor_metadata( @@ -377,16 +378,20 @@ TORCH_API void propagate_xla_data_direct( Tensor create_functional_tensor_with_view_meta( const Tensor& view_to_wrap, const Tensor& base, - functionalization::ViewMeta meta, + const std::shared_ptr& meta, int64_t out_idx = 0); std::vector create_functional_tensor_with_view_meta( ITensorListRef view_to_wrap, const Tensor& base, - const functionalization::ViewMeta& meta); + const std::shared_ptr& meta); void mutate_view_meta( const Tensor& self, - const functionalization::ViewMeta& meta); + const std::shared_ptr& meta); + +TORCH_API Tensor apply_view_meta_sequence( + const Tensor& base, + const std::vector>& sequence); void set_sizes_strides_offset(const Tensor& out, const Tensor& meta_out); void set_sizes_strides_offset( diff --git a/aten/src/ATen/FunctionalizeFallbackKernel.cpp b/aten/src/ATen/FunctionalizeFallbackKernel.cpp index 97094c9f125a..10f988b4d281 100644 --- a/aten/src/ATen/FunctionalizeFallbackKernel.cpp +++ b/aten/src/ATen/FunctionalizeFallbackKernel.cpp @@ -1,3 +1,5 @@ +#include + #include #include #include @@ -7,7 +9,6 @@ #include #include #include -#include #ifndef AT_PER_OPERATOR_HEADERS #include @@ -28,6 +29,31 @@ #include #endif +namespace at::functionalization { + +Tensor resize__ViewMeta::forward(const Tensor& base) { + if (reapply_views) { + return base.as_strided(size, c10::contiguous_strides(size)); + } else { + return at::as_strided_copy(base, size, c10::contiguous_strides(size)); + } +} + +Tensor resize__ViewMeta::reverse(const Tensor& base, const Tensor& mutated_view) { + return base.as_strided_scatter( + mutated_view, size, c10::contiguous_strides(size)); +} + +Tensor _unsafe_view_ViewMeta::forward(const Tensor& base) { + return at::_unsafe_view_symint(base, size); +} + +Tensor _unsafe_view_ViewMeta::reverse(const Tensor& base, const Tensor& mutated_view) { + return at::_unsafe_view_symint(mutated_view, base.sym_sizes()); +} + +} // namespace at::functionalization + namespace { void functionalizeFallback(const c10::OperatorHandle& op, c10::DispatchKeySet dispatchKeySet [[maybe_unused]], torch::jit::Stack* stack) { const auto& schema = op.schema(); @@ -106,7 +132,9 @@ namespace { const auto& ivalue = returns[idx]; if (ivalue.isTensor() && should_wrap_outputs) { const auto& t = ivalue.toTensor(); - if (!t.defined()) continue; + if (!t.defined()) { + continue; + } auto t_new = c10::IValue(at::functionalization::impl::to_functional_tensor(t)); (*stack)[returns_begin + idx] = t_new; } else if (ivalue.isTensorList() && should_wrap_outputs) { @@ -169,19 +197,8 @@ static const at::Tensor & resize__functionalization(c10::DispatchKeySet dispatch // The output of resizing is equivalent to taking a slice of a larger tensor. // We have to emulate this "slicing" with an as_strided call. 
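  // Editor's illustration (not part of this patch): when the new size fits in
  // the existing storage, functionalized resize_ is modeled as viewing a prefix
  // of the larger tensor, e.g. for a base of 16 contiguous elements:
  //   base.as_strided({2, 2}, {2, 1})                      // forward(): first 4 elements
  //   base.as_strided_scatter(mutated, {2, 2}, {2, 1})     // reverse(): write them back
  // which is what the new resize__ViewMeta::forward/reverse shown above implement
  // via c10::contiguous_strides(size).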
auto reapply_views = at::functionalization::impl::getFunctionalizationReapplyViewsTLS(); - at::functionalization::ViewMeta view_meta = at::functionalization::ViewMeta( - [reapply_views = reapply_views, size = size.vec()](const at::Tensor & base, int64_t mutated_view_idx [[maybe_unused]]) -> at::Tensor { - if (reapply_views) { - return base.as_strided(size, c10::contiguous_strides(size)); - } else { - return at::as_strided_copy(base, size, c10::contiguous_strides(size)); - } - }, - [size = size.vec()](const at::Tensor & base, const at::Tensor & mutated_view, int64_t mutated_view_idx [[maybe_unused]]) -> at::Tensor { - return base.as_strided_scatter(mutated_view, size, c10::contiguous_strides(size)); - }, - /*has_symbolic_inputs=*/false - ); + auto view_meta = std::make_shared( + reapply_views, size.vec()); at::functionalization::impl::mutate_view_meta(self, view_meta); return self; } @@ -300,17 +317,11 @@ static at::Tensor _unsafe_view_functionalize(const at::Tensor & self, at::SymInt tmp_output = at::_unsafe_view_symint(self_, size); } - bool has_symbolic_inputs = std::any_of(size.begin(), size.end(), [=](auto& s) { return s.is_symbolic(); }); - - at::functionalization::ViewMeta view_meta = at::functionalization::ViewMeta( - [size = size.vec()](const at::Tensor & base, int64_t mutated_view_idx [[maybe_unused]]) -> at::Tensor { - return at::_unsafe_view_symint(base, size); - }, - [size = size.vec()](const at::Tensor & base, const at::Tensor & mutated_view, int64_t mutated_view_idx [[maybe_unused]]) -> at::Tensor { - return at::_unsafe_view_symint(mutated_view, base.sym_sizes()); - }, - /*has_symbolic_inputs=*/has_symbolic_inputs - ); + bool has_symbolic_inputs = std::any_of( + size.begin(), size.end(), [=](auto& s) { return s.is_symbolic(); }); + auto view_meta = + std::make_shared( + has_symbolic_inputs, size.vec()); auto out = at::functionalization::impl::create_functional_tensor_with_view_meta(tmp_output, self, std::move(view_meta)); // See Note [Propagating strides in the functionalization pass] diff --git a/aten/src/ATen/FunctionalizeFallbackKernel.h b/aten/src/ATen/FunctionalizeFallbackKernel.h new file mode 100644 index 000000000000..aabcfc827af3 --- /dev/null +++ b/aten/src/ATen/FunctionalizeFallbackKernel.h @@ -0,0 +1,58 @@ +#pragma once + +#include + +namespace at::functionalization { + +// `ViewMeta` implementation for `resize_` operation. +struct TORCH_API resize__ViewMeta : public ViewMeta { + FUNCTIONALIZATION_VIEWMETA_NAME(resize__ViewMeta) + FUNCTIONALIZATION_VIEWMETA_SERIALIZABLE_TUPLE( + bool /* reapply_views */, + const std::vector&); + + resize__ViewMeta(const SerializableTuple& tpl) + : resize__ViewMeta(std::get<0>(tpl), std::get<1>(tpl)) {} + + resize__ViewMeta(bool reapply_views, const std::vector& size) + : ViewMeta(/*has_symbolic_inputs=*/false), + reapply_views(reapply_views), + size(size) {} + + Tensor forward(const Tensor& base) override; + Tensor reverse(const Tensor& base, const Tensor& mutated_view) override; + + SerializableTuple to_serializable_tuple() { + return std::make_tuple(reapply_views, size); + } + + bool reapply_views; + std::vector size; +}; + +// `ViewMeta` implementation for `_unsafe_view` operation. 
+struct TORCH_API _unsafe_view_ViewMeta : public ViewMeta { + FUNCTIONALIZATION_VIEWMETA_NAME(_unsafe_view_ViewMeta) + FUNCTIONALIZATION_VIEWMETA_SERIALIZABLE_TUPLE( + bool /* has_symbolic_inputs */, + const std::vector&); + + _unsafe_view_ViewMeta(const SerializableTuple& tpl) + : _unsafe_view_ViewMeta(std::get<0>(tpl), std::get<1>(tpl)) {} + + _unsafe_view_ViewMeta( + bool has_symbolic_inputs, + const std::vector& size) + : ViewMeta(has_symbolic_inputs), size(size) {} + + Tensor forward(const Tensor& base) override; + Tensor reverse(const Tensor& base, const Tensor& mutated_view) override; + + SerializableTuple to_serializable_tuple() { + return std::make_tuple(has_symbolic_inputs, size); + } + + std::vector size; +}; + +} // namespace at::functionalization diff --git a/aten/src/ATen/ThreadLocalState.cpp b/aten/src/ATen/ThreadLocalState.cpp index 33977d8d7cf8..22509c7be4e1 100644 --- a/aten/src/ATen/ThreadLocalState.cpp +++ b/aten/src/ATen/ThreadLocalState.cpp @@ -8,6 +8,7 @@ #include #include #include +#include namespace at { @@ -19,6 +20,7 @@ ThreadLocalState::ThreadLocalState() torch_dispatch_mode_state_(c10::impl::TorchDispatchModeTLS::get_state()), python_dispatcher_state_(c10::impl::PythonDispatcherTLS::get_state()), python_torch_function_state_(at::impl::PythonTorchFunctionTLS::get_state()), saved_tensors_default_hooks_state_(at::SavedTensorDefaultHooks::get_tls_state()), functionalization_reapply_views_state_(at::functionalization::impl::getFunctionalizationReapplyViewsTLS()), + dtensor_allow_implicit_replication_(at::get_dtensor_allow_implicit_replication()), saved_objects_(at::impl::ThreadLocalPythonObjects::get_state()) { #if !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE) && !defined(BUILD_LITE_INTERPRETER) for(size_t i=0; i #include #include diff --git a/aten/src/ATen/core/boxing/KernelFunction_impl.h b/aten/src/ATen/core/boxing/KernelFunction_impl.h index be93d5991e9a..672309ec19a2 100644 --- a/aten/src/ATen/core/boxing/KernelFunction_impl.h +++ b/aten/src/ATen/core/boxing/KernelFunction_impl.h @@ -15,7 +15,7 @@ std::enable_if_t< std::is_base_of_v, std::unique_ptr> make_unique_base(Args&&... 
args) { - return std::unique_ptr(new Child(std::forward(args)...)); + return std::make_unique(std::forward(args)...); } } // namespace detail diff --git a/aten/src/ATen/core/dynamic_type.h b/aten/src/ATen/core/dynamic_type.h index b33e7ce0c549..2ba841e44e20 100644 --- a/aten/src/ATen/core/dynamic_type.h +++ b/aten/src/ATen/core/dynamic_type.h @@ -64,6 +64,7 @@ constexpr DynamicTypeBits kDynamicClassTypeBit = DYNAMIC_TYPE_BIT(10); _(ScalarType, kDynamicIntTypeBit, 1) \ _(Layout, kDynamicIntTypeBit, 1) \ _(SymInt, kDynamicIntTypeBit, 1) \ + _(SymBool, kDynamicIntTypeBit, 1) \ _(MemoryFormat, kDynamicIntTypeBit, 1) #define FORWARD_DECL_TYPE(NAME, _, __) struct NAME ## Type; diff --git a/aten/src/ATen/cuda/CUDABlas.cpp b/aten/src/ATen/cuda/CUDABlas.cpp index 4ab57f0beb1c..0d319ea59384 100644 --- a/aten/src/ATen/cuda/CUDABlas.cpp +++ b/aten/src/ATen/cuda/CUDABlas.cpp @@ -996,9 +996,6 @@ void bgemm(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)) { template <> void bgemm(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(at::Half, float)) { - #ifdef USE_ROCM - TORCH_CHECK(false, "bgemm input type at::Half and output type float is not supported for ROCm"); - #endif // TODO: Support tuning for Half inputs and FP32 output bgemm_internal(CUDABLAS_BGEMM_ARGS(at::Half)); } @@ -1006,9 +1003,7 @@ void bgemm(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(at::Half, float) template <> void bgemm(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(at::BFloat16, float)) { - #ifdef USE_ROCM - TORCH_CHECK(false, "bgemm input type at::BFloat16 and output type float is not supported for ROCm"); - #else + #ifndef USE_ROCM cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties(); if (prop->major < 8) @@ -1513,9 +1508,6 @@ void gemm(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) { template <> void gemm(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(at::Half, float)) { - #ifdef USE_ROCM - TORCH_CHECK(false, "gemm input type at::Half and output type float is not supported for ROCm"); - #endif // TODO: Support Tuning for fp16-fp32 gemm gemm_internal(CUDABLAS_GEMM_ARGS(at::Half)); } @@ -1523,9 +1515,7 @@ void gemm(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(at::Half, float)) template <> void gemm(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(at::BFloat16, float)) { - #ifdef USE_ROCM - TORCH_CHECK(false, "gemm input type at::BFloat16 and output type float is not supported for ROCm"); - #else + #ifndef USE_ROCM cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties(); if (prop->major < 8) @@ -1947,11 +1937,11 @@ void scaled_gemm( computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_TRANSB, _cublasOpFromChar(transb)); cublasLtMatmulDescAttributes_t matmulDescA = CUBLASLT_MATMUL_DESC_A_SCALE_POINTER; cublasLtMatmulDescAttributes_t matmulDescB = CUBLASLT_MATMUL_DESC_B_SCALE_POINTER; +#if defined(USE_ROCM) && !defined(HIPBLASLT_OUTER_VEC) && defined(HIPBLASLT_VEC_EXT) // hipblaslt supported row-wise before cublas, and did so their own way (via // the SCALE_POINTERSs), but then migrated to match how cublas does it (via // the SCALE_MODEs). Here we check for this early custom mode. 
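For context on what the row-wise mode being selected here computes, independent of which hipblaslt or cublasLt attribute carries the scales: the first operand has one dequantization scale per row and the second has one per column, and the epilogue applies them as an outer product on the fp32 accumulator. A plain CPU reference sketch of those numerics; the function name and row-major layouts are illustrative assumptions, not the blasLt call sequence.

#include <cstdint>
#include <vector>

void scaled_gemm_rowwise_ref(
    const std::vector<float>& A,        // M x K, fp8 values upcast to float, not yet scaled
    const std::vector<float>& B,        // K x N, same
    const std::vector<float>& a_scale,  // one scale per row of A (size M)
    const std::vector<float>& b_scale,  // one scale per column of B (size N)
    std::vector<float>& C,              // M x N output
    int64_t M, int64_t K, int64_t N) {
  for (int64_t m = 0; m < M; ++m) {
    for (int64_t n = 0; n < N; ++n) {
      float acc = 0.f;
      for (int64_t k = 0; k < K; ++k) {
        acc += A[m * K + k] * B[k * N + n];
      }
      // The two scale vectors enter as an outer product on the output.
      C[m * N + n] = acc * a_scale[m] * b_scale[n];
    }
  }
}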
bool use_rowwise = (mat1_scaling_type == ScalingType::RowWise && mat2_scaling_type == ScalingType::RowWise); -#if defined(USE_ROCM) && !defined(HIPBLASLT_OUTER_VEC) && defined(HIPBLASLT_VEC_EXT) if (use_rowwise) { matmulDescA = HIPBLASLT_MATMUL_DESC_A_SCALE_POINTER_VEC_EXT; matmulDescB = HIPBLASLT_MATMUL_DESC_B_SCALE_POINTER_VEC_EXT; @@ -1966,8 +1956,12 @@ void scaled_gemm( } #endif } -#else - // rowwise isn't supported using cublaslt or older hipblaslt +#elif (CUDA_VERSION < 12090) && !defined(USE_ROCM) + // hipblaslt supported row-wise before cublas, and did so their own way (via + // the SCALE_POINTERSs), but then migrated to match how cublas does it (via + // the SCALE_MODEs). Here we check for this early custom mode. + bool use_rowwise = (mat1_scaling_type == ScalingType::RowWise && mat2_scaling_type == ScalingType::RowWise); + // rowwise isn't supported using older cublaslt or older hipblaslt TORCH_INTERNAL_ASSERT(use_rowwise == false, "rowwise scaled_gemm not supported with blaslt"); #endif // if defined(USE_ROCM) && !defined(HIPBLASLT_OUTER_VEC) && defined(HIPBLASLT_VEC_EXT) computeDesc.setAttribute(matmulDescA, mat1_scale_ptr); @@ -2583,8 +2577,6 @@ void vdot>(CUDABLAS_DOT_ARGTYPES(c10::complex)) { reinterpret_cast(result))); } -// HIP on Windows does not support -#if !(defined(USE_ROCM) && defined(_MSC_VER)) template <> void getrsBatched(CUDABLAS_GETRS_ARGTYPES(float)) { TORCH_CUDABLAS_CHECK(cublasSgetrsBatched( @@ -2783,6 +2775,5 @@ void gelsBatched>(CUDABLAS_GELS_BATCHED_ARGTYPES(c10::comple devInfoArray, batchSize)); } -#endif // !(defined(USE_ROCM) && defined(_MSC_VER)) } // namespace at::cuda::blas diff --git a/aten/src/ATen/cuda/CUDABlas.h b/aten/src/ATen/cuda/CUDABlas.h index 5021917fe095..b235840418e2 100644 --- a/aten/src/ATen/cuda/CUDABlas.h +++ b/aten/src/ATen/cuda/CUDABlas.h @@ -343,9 +343,6 @@ void vdot>(CUDABLAS_DOT_ARGTYPES(c10::complex)); int m, int n, int nrhs, Dtype** dA_array, int ldda, \ Dtype** dC_array, int lddc, int* info, int *devInfoArray, int batchSize -// HIP on Windows does not support getrs, geqrf, getrf, gels -#if !(defined(USE_ROCM) && defined(_MSC_VER)) - template void getrsBatched(CUDABLAS_GETRS_ARGTYPES(Dtype)) { static_assert(false&&sizeof(Dtype),"at::cuda::blas::getrsBatched: not implemented"); @@ -400,28 +397,4 @@ TORCH_CUDA_CU_API void gelsBatched>(CUDABLAS_GELS_BATCHED_A template<> TORCH_CUDA_CU_API void gelsBatched>(CUDABLAS_GELS_BATCHED_ARGTYPES(c10::complex)); -#else // !(defined(USE_ROCM) && defined(_MSC_VER)) - -template -void getrsBatched(CUDABLAS_GETRS_ARGTYPES(Dtype)) { - TORCH_CHECK(false, "at::cuda::blas::getrsBatched: not supported for HIP on Windows"); -} - -template -void geqrfBatched(CUDABLAS_GEQRF_BATCHED_ARGTYPES(Dtype)) { - TORCH_CHECK(false, "at::cuda::blas::geqrfBatched: not supported for HIP on Windows"); -} - -template -void getrfBatched(CUDABLAS_GETRF_ARGTYPES(Dtype)) { - TORCH_CHECK(false, "at::cuda::blas::getrfBatched: not supported for HIP on Windows"); -} - -template -void gelsBatched(CUDABLAS_GELS_BATCHED_ARGTYPES(Dtype)) { - TORCH_CHECK(false, "at::cuda::blas::gelsBatched: not supported for HIP on Windows"); -} - -#endif // !(defined(USE_ROCM) && defined(_MSC_VER)) - } // namespace at::cuda::blas diff --git a/aten/src/ATen/cuda/detail/OffsetCalculator.cuh b/aten/src/ATen/cuda/detail/OffsetCalculator.cuh index 60e1a19c1aac..a65db3f2df12 100644 --- a/aten/src/ATen/cuda/detail/OffsetCalculator.cuh +++ b/aten/src/ATen/cuda/detail/OffsetCalculator.cuh @@ -45,6 +45,24 @@ struct OffsetCalculator { C10_HOST_DEVICE 
offset_type get(index_t linear_idx) const { offset_type offsets; + +#if defined(USE_ROCM) + if ((dims > 0) && (dims <= 2)) { + auto divmod = sizes_[0].divmod(linear_idx); + #pragma unroll + for (int arg = 0; arg < NARGS; arg++) + offsets[arg] = divmod.mod * strides_[0][arg]; + if (dims >= 2) { + divmod = sizes_[1].divmod(divmod.div); + #pragma unroll + for (int arg = 0; arg < NARGS; arg++) + offsets[arg] += divmod.mod * strides_[1][arg]; + } + // [...] + return offsets; + } +#endif + #pragma unroll for (int arg = 0; arg < NARGS; arg++) { offsets[arg] = 0; diff --git a/aten/src/ATen/cuda/nvrtc_stub/ATenNVRTC.h b/aten/src/ATen/cuda/nvrtc_stub/ATenNVRTC.h index d89875865b88..aca83386ad42 100644 --- a/aten/src/ATen/cuda/nvrtc_stub/ATenNVRTC.h +++ b/aten/src/ATen/cuda/nvrtc_stub/ATenNVRTC.h @@ -117,6 +117,8 @@ namespace at::cuda { _(nvrtcGetPTXSize) \ _(nvrtcGetPTX) \ _(cuModuleLoadData) \ + _(cuModuleLoad) \ + _(cuGetErrorString) \ _(cuModuleGetFunction) \ _(HIPOCCUPANCYMAXACTIVEBLOCKSPERMULTIPROCESSOR) \ _(nvrtcGetErrorString) \ diff --git a/aten/src/ATen/cuda/tunable/Tunable.cpp b/aten/src/ATen/cuda/tunable/Tunable.cpp index 9972cbd1c151..3511e48ae061 100644 --- a/aten/src/ATen/cuda/tunable/Tunable.cpp +++ b/aten/src/ATen/cuda/tunable/Tunable.cpp @@ -220,19 +220,17 @@ TuningResultsValidator::TuningResultsValidator() { []() { return GetPyTorchVersion(); }, [this](auto&& k) { return ValidatePyTorchVersion(std::forward(k)); }); #ifdef USE_ROCM - // rocm + // hip { -#ifdef _WIN32 - std::string rocm_version = HIP_VERSION_BUILD_NAME; -#else - std::string rocm_version = ROCM_BUILD_INFO; -#endif + // HIP version is more accurate than ROCm version. User's environment could be a stock + // ROCm install but with a mix of newer components, making ROCm version meaningless. + std::string hip_version = c10::str(TORCH_HIP_VERSION); RegisterValidator( - "ROCM_VERSION", - [rocm_version]() { return rocm_version; }, - [rocm_version](auto&& k) { - TUNABLE_LOG1("ROCM_VERSION validation: expect ", k, " to match ", rocm_version); - return rocm_version == k ? OK : FAIL; + "HIP_VERSION", + [hip_version]() { return hip_version; }, + [hip_version](auto&& k) { + TUNABLE_LOG1("HIP_VERSION validation: expect ", k, " to match ", hip_version); + return hip_version == k ? OK : FAIL; }); } // gfx arch diff --git a/aten/src/ATen/cudnn/Descriptors.h b/aten/src/ATen/cudnn/Descriptors.h index 6c2492b12e6b..85f0286542e7 100644 --- a/aten/src/ATen/cudnn/Descriptors.h +++ b/aten/src/ATen/cudnn/Descriptors.h @@ -38,6 +38,7 @@ inline int dataSize(cudnnDataType_t dataType) } } +// NOTE [ cudnn fixSizeOneDimStride ] // The stride for a size-1 dimensions is not uniquely determined; in // fact, it can be anything you want, because the fact that the // tensor is size 1 at this dimension means that you will never actually diff --git a/aten/src/ATen/functorch/BatchRulesModules.cpp b/aten/src/ATen/functorch/BatchRulesModules.cpp index de69e5c1e23a..6e63708a90f4 100644 --- a/aten/src/ATen/functorch/BatchRulesModules.cpp +++ b/aten/src/ATen/functorch/BatchRulesModules.cpp @@ -7,6 +7,7 @@ #include #include #include +#include #include @@ -44,8 +45,13 @@ static std::tuple> embedding_batch_rule( const auto weight_ = reshape_dim_into(*weight_bdim, 0, weight); auto indices_ = moveBatchDimToFront(indices, indices_bdim); - const auto range = getStepTensor(indices, batch_size, num_embeddings); - indices_ = indices_ + range; + { + // getStepTensor returns a regular Tensor. 
If indices_ is a DTensor + // we want to allow this mixed DTensor-Tensor operation. + at::DTensorAllowImplicitReplication guard; + const auto range = getStepTensor(indices, batch_size, num_embeddings); + indices_ = indices_ + range; + } auto result = at::embedding_symint(weight_, indices_, std::move(padding_idx), scale_grad_by_freq, sparse); return std::make_tuple(std::move(result), 0); } diff --git a/aten/src/ATen/functorch/BatchRulesUnaryOps.cpp b/aten/src/ATen/functorch/BatchRulesUnaryOps.cpp index b26d2c4a419e..48a735c3e533 100644 --- a/aten/src/ATen/functorch/BatchRulesUnaryOps.cpp +++ b/aten/src/ATen/functorch/BatchRulesUnaryOps.cpp @@ -171,6 +171,8 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchBatched, m) { POINTWISE_BOXED(fill_.Scalar); POINTWISE_BOXED(zero_); + // This is special because this op doesn't return anything + m.impl("_assert_tensor_metadata", native::_assert_tensor_metadata); #undef UNARY_POINTWISE #undef UNARY_POINTWISE_ALL diff --git a/aten/src/ATen/miopen/Descriptors.cpp b/aten/src/ATen/miopen/Descriptors.cpp index 08c09b88f99c..86e42ee3b66d 100644 --- a/aten/src/ATen/miopen/Descriptors.cpp +++ b/aten/src/ATen/miopen/Descriptors.cpp @@ -19,31 +19,37 @@ inline miopenDataType_t getDataType(const at::Tensor& t) { } else { TORCH_CHECK( false, - "TensorDescriptor only supports float, half and bfloat16 tensors"); + "TensorDescriptor does not support ", scalar_type); } } } // anonymous namespace +constexpr size_t MIOPEN_DIM_MAX = 5; -void TensorDescriptor::set(const at::Tensor &t, size_t pad) { - set(getDataType(t), t.sizes(), t.strides(), pad); +void TensorDescriptor::set(const at::Tensor &t, at::MemoryFormat memory_format, size_t pad) { + set(getDataType(t), t.sizes(), t.strides(), pad, + memory_format == at::MemoryFormat::ChannelsLast || + memory_format == at::MemoryFormat::ChannelsLast3d); } -constexpr size_t MIOPEN_DIM_MAX = 5; +void TensorDescriptor::set(const at::Tensor &t, size_t pad) { + auto memory_format = t.suggest_memory_format(); + set(getDataType(t), t.sizes(), t.strides(), pad, + memory_format == at::MemoryFormat::ChannelsLast || + memory_format == at::MemoryFormat::ChannelsLast3d); +} void TensorDescriptor::set(miopenDataType_t datatype, IntArrayRef t_sizes, IntArrayRef t_strides, size_t pad) { + set(datatype, t_sizes, t_strides, pad, + is_channels_last_strides_2d(t_sizes, t_strides) || + is_channels_last_strides_3d(t_sizes, t_strides)); +} + +void TensorDescriptor::set(miopenDataType_t datatype, IntArrayRef t_sizes, IntArrayRef t_strides, size_t pad, bool nhwc) { size_t dim = t_sizes.size(); if (dim > MIOPEN_DIM_MAX || pad > MIOPEN_DIM_MAX) -#define _STR(X) #X -#define STR(X) _STR(X) - TORCH_CHECK( - false, - "MIOpen supports only up to ", - STR(MIOPEN_DIM_MAX), - " dimensions"); -#undef _STR -#undef STR + TORCH_CHECK(false, "MIOpen supports only up to ", MIOPEN_DIM_MAX, " dimensions"); int size[MIOPEN_DIM_MAX]; int stride[MIOPEN_DIM_MAX]; for (const auto i : c10::irange(dim)) { @@ -54,7 +60,7 @@ void TensorDescriptor::set(miopenDataType_t datatype, IntArrayRef t_sizes, IntAr size[i] = 1; stride[i] = 1; } - set(datatype, static_cast(std::max(dim, pad)), size, stride); + set(datatype, static_cast(std::max(dim, pad)), size, stride, nhwc); } std::string miopenTypeToString(miopenDataType_t dtype) { @@ -74,10 +80,11 @@ std::string miopenTypeToString(miopenDataType_t dtype) { std::ostream& operator<<(std::ostream & out, const TensorDescriptor& d) { out << "TensorDescriptor " << static_cast(d.desc()) << "\n"; - int nbDims = 4; + int nbDims = 0; int dimA[MIOPEN_DIM_MAX]; 
int strideA[MIOPEN_DIM_MAX]; miopenDataType_t dtype; + miopenGetTensorDescriptorSize(d.desc(), &nbDims); miopenGetTensorDescriptor(d.desc(), &dtype, dimA, strideA); out << " type = " << miopenTypeToString(dtype) << "\n"; out << " nbDims = " << nbDims << "\n"; @@ -99,19 +106,17 @@ void TensorDescriptor::print() { std::cout << *this; } void FilterDescriptor::set(const at::Tensor &t, const at::MemoryFormat memory_format, int64_t pad) { auto dim = t.ndimension(); - if (dim > static_cast(MIOPEN_DIM_MAX) || pad > static_cast(MIOPEN_DIM_MAX)) { -#define _STR(X) #X -#define STR(X) _STR(X) - TORCH_CHECK( - false, - "MIOpen supports only up to ", - STR(MIOPEN_DIM_MAX), - " dimensions"); -#undef _STR -#undef STR - } + if (dim > MIOPEN_DIM_MAX || pad > MIOPEN_DIM_MAX) + TORCH_CHECK(false, "MIOpen supports only up to ", MIOPEN_DIM_MAX, " dimensions"); + // NB: It is possible for this test to be insufficient, because the + // Tensor passed in to set the filter descriptor may not be the actual + // Tensor whose data pointer is passed to cuDNN. Nevertheless, + // that is the common case, so we can catch most client errors with this test. TORCH_CHECK(t.is_contiguous(memory_format), - "MIOpen filters (a.k.a. weights) must be contiguous"); + "MIOpen filters (a.k.a. weights) must be contiguous in desired memory_format\n", + "Weight sizes: ", t.sizes(), "\n", + "Weight strides: ", t.strides(), "\n", + "cuDNN suggested memory_format: ", memory_format); int size[MIOPEN_DIM_MAX]; int stride[MIOPEN_DIM_MAX]; @@ -131,7 +136,9 @@ void FilterDescriptor::set(const at::Tensor &t, const at::MemoryFormat memory_fo } dim = std::max(dim, pad); - set(getDataType(t), (int) dim, size, stride); + set(getDataType(t), static_cast(dim), size, stride, + memory_format == at::MemoryFormat::ChannelsLast || + memory_format == at::MemoryFormat::ChannelsLast3d); } }} diff --git a/aten/src/ATen/miopen/Descriptors.h b/aten/src/ATen/miopen/Descriptors.h index 2eee837cd533..8825575c9231 100644 --- a/aten/src/ATen/miopen/Descriptors.h +++ b/aten/src/ATen/miopen/Descriptors.h @@ -9,6 +9,8 @@ namespace at { namespace native { +std::string miopenTypeToString(miopenDataType_t dtype); + inline int dataSize(miopenDataType_t dataType) { switch (dataType) { @@ -19,6 +21,32 @@ inline int dataSize(miopenDataType_t dataType) } } +// See NOTE [ cudnn fixSizeOneDimStride ] in aten/src/ATen/cudnn/Descriptors.h +template +static inline void fixSizeOneDimStride(int dim, const T *size, T *stride, bool nhwc) { + int64_t z = 1; + int index = 0; + std::vector permutation(dim); + + if (nhwc) { + permutation[index++] = 1; + } + for (int d = dim-1; d > 1; d--) { + permutation[index++] = d; + } + if (!nhwc) { + permutation[index++] = 1; + } + permutation[index++] = 0; + for (int d : permutation) { + if (size[d] == 1) { + stride[d] = z; + } else { + z *= size[d]; + } + } +} + template struct DescriptorDeleter { void operator()(T* x) { @@ -75,14 +103,20 @@ class TORCH_HIP_CPP_API TensorDescriptor : public Descriptor< set(t, pad); } + // See Note [CuDNN broadcast padding] void set(const at::Tensor &t, size_t pad = 0); + void set(const at::Tensor &t, at::MemoryFormat memory_format, size_t pad = 0); void set(miopenDataType_t dataType, IntArrayRef sizes, IntArrayRef strides, size_t pad = 0); void print(); private: - void set(miopenDataType_t dataType, int dim, int* size, int* stride) { - MIOPEN_CHECK(miopenSetTensorDescriptor(mut_desc(), dataType, dim, size, stride)); + void set(miopenDataType_t dataType, IntArrayRef sizes, IntArrayRef strides, size_t pad, bool nhwc); + 
+ void set(miopenDataType_t dataType, int dim, int* size, int* stride, bool nhwc) { + std::vector strides_copy(stride, stride + dim); + fixSizeOneDimStride(dim, size, strides_copy.data(), nhwc); + MIOPEN_CHECK(miopenSetTensorDescriptor(mut_desc(), dataType, dim, size, strides_copy.data())); } }; @@ -100,8 +134,10 @@ class TORCH_HIP_CPP_API FilterDescriptor : public Descriptor< void set(const at::Tensor &t, const at::MemoryFormat memory_format, int64_t pad = 0); private: - void set(miopenDataType_t dataType, int dim, int* size, int* stride) { - MIOPEN_CHECK(miopenSetTensorDescriptor(mut_desc(), dataType, dim, size, stride)); + void set(miopenDataType_t dataType, int dim, int* size, int* stride, bool nhwc) { + std::vector strides_copy(stride, stride + dim); + fixSizeOneDimStride(dim, size, strides_copy.data(), nhwc); + MIOPEN_CHECK(miopenSetTensorDescriptor(mut_desc(), dataType, dim, size, strides_copy.data())); } }; @@ -166,4 +202,4 @@ union Constant } }; -}} // namespace +}} // namespace diff --git a/aten/src/ATen/mps/EmptyTensor.cpp b/aten/src/ATen/mps/EmptyTensor.cpp index d858df073397..6c58de099648 100644 --- a/aten/src/ATen/mps/EmptyTensor.cpp +++ b/aten/src/ATen/mps/EmptyTensor.cpp @@ -12,7 +12,7 @@ #define MPS_ERROR_NOT_COMPILED "PyTorch code is not compiled with MPS enabled" #define MPS_ERROR_RUNTIME_TOO_LOW \ - "The MPS backend is supported on MacOS 13.0+.", \ + "The MPS backend is supported on MacOS 14.0+. ", \ "Current OS version can be queried using `sw_vers`" #define MPS_ERROR_DOUBLE_NOT_SUPPORTED "Cannot convert a MPS Tensor to float64 dtype " \ "as the MPS framework doesn't support float64. Please use float32 instead." diff --git a/aten/src/ATen/mps/MPSHooks.mm b/aten/src/ATen/mps/MPSHooks.mm index a2ec221c1bfe..34fbd31af91d 100644 --- a/aten/src/ATen/mps/MPSHooks.mm +++ b/aten/src/ATen/mps/MPSHooks.mm @@ -70,7 +70,10 @@ } void* MPSHooks::getCommandBuffer() const { - return at::mps::getDefaultMPSStream()->commandBuffer(); + auto stream = at::mps::getDefaultMPSStream(); + // Release pending computeCommandEncoder, as extensions are likely to allocate a new one + stream->endKernelCoalescing(); + return stream->commandBuffer(); } void* MPSHooks::getDispatchQueue() const { diff --git a/aten/src/ATen/mps/MPSStream.mm b/aten/src/ATen/mps/MPSStream.mm index e9627a343ad6..71325bd69e1d 100644 --- a/aten/src/ATen/mps/MPSStream.mm +++ b/aten/src/ATen/mps/MPSStream.mm @@ -158,7 +158,18 @@ @interface MPSGraphExecutionDescriptor () endKernelCoalescing(); id blitEncoder = [commandBuffer() blitCommandEncoder]; - [blitEncoder fillBuffer:buffer range:NSMakeRange(offset, length) value:value]; + // For some reason fillBuffer stopped working for length > 4Gb on MacOS 26 + // See https://github.com/pytorch/pytorch/issues/163962 + // Workaround by batching copy commands into 4Gb chunks + constexpr size_t max_copy_size = 0x100000000; // 4GB + size_t bytes_filled = 0; + size_t bytes_remains = length; + while (bytes_remains > 0) { + NSUInteger bytes_to_copy = std::min(max_copy_size, bytes_remains); + [blitEncoder fillBuffer:buffer range:NSMakeRange(offset + bytes_filled, bytes_to_copy) value:value]; + bytes_filled += bytes_to_copy; + bytes_remains -= bytes_to_copy; + } [blitEncoder endEncoding]; synchronize(syncType); } diff --git a/aten/src/ATen/native/Blas.cpp b/aten/src/ATen/native/Blas.cpp index 674ccf11cfb9..49366151ae60 100644 --- a/aten/src/ATen/native/Blas.cpp +++ b/aten/src/ATen/native/Blas.cpp @@ -9,6 +9,7 @@ #include #include #include +#include #if !defined(__s390x__) && !defined(__powerpc__)
#include #endif @@ -332,4 +333,23 @@ _scaled_mm_cpu(const Tensor& mat_a, const Tensor& mat_b, return _scaled_mm_out_cpu(mat_a, mat_b, scale_a, scale_b, bias, scale_result, out_dtype, use_fast_accum, out); } +// TODO(vasiliy, future PR): figure out why we need to declare this function, when +// other functions that live in ATen/native/*.cpp without declarations +// or headers work just fine. +Tensor _grouped_mm(const Tensor& mat_a, const Tensor& mat_b, +const std::optional& offs, +const std::optional& bias, +std::optional out_dtype); + +Tensor _grouped_mm(const Tensor& mat_a, const Tensor& mat_b, +const std::optional& offs, +const std::optional& bias, +std::optional out_dtype) { + _grouped_mm_validate_inputs(mat_a, mat_b, offs, bias, out_dtype); + const auto out_dtype_ = _resolve_grouped_mm_out_dtype(mat_a, mat_b, out_dtype); + Tensor out = create_grouped_gemm_output_tensor(mat_a, mat_b, offs, out_dtype_); + _grouped_mm_fallback(mat_a, mat_b, offs, bias, out_dtype, out); + return out; +} + } // namespace at::native diff --git a/aten/src/ATen/native/CPUBlas.cpp b/aten/src/ATen/native/CPUBlas.cpp index b16c1ef04fa0..e06afddd05aa 100644 --- a/aten/src/ATen/native/CPUBlas.cpp +++ b/aten/src/ATen/native/CPUBlas.cpp @@ -496,18 +496,18 @@ void gemm( // for the fallback path, first compute gemm with beta = 0, // and then add c in full precision. int64_t c_size = n * m; - std::vector float16_c(c_size, 0.f); - gemm_stub( + std::vector float_c(c_size, 0.f); + gemm_no_downcast_stub( at::kCPU, at::kHalf, - transa, transb, m, n, k, alpha, a, lda, b, ldb, 0.f, float16_c.data(), m); + transa, transb, m, n, k, alpha, a, lda, b, ldb, 0.f, float_c.data(), m); for (const auto j : c10::irange(n)) { for (const auto i : c10::irange(m)) { auto offset = j * ldc + i; // beta == 0 won't propagate NaN from C if (beta == 0.f) { - c[offset] = c10::convert(float16_c[j * m + i]); + c[offset] = float_c[j * m + i]; } else { - c[offset] = beta * c[offset] + c10::convert(float16_c[j * m + i]); + c[offset] = beta * c[offset] + float_c[j * m + i]; } } } diff --git a/aten/src/ATen/native/ConvUtils.h b/aten/src/ATen/native/ConvUtils.h index 84381efe55b0..e160c84ced33 100644 --- a/aten/src/ATen/native/ConvUtils.h +++ b/aten/src/ATen/native/ConvUtils.h @@ -353,19 +353,21 @@ TORCH_API void _cudnn_set_conv_benchmark_empty_cache(bool enable); TORCH_API bool _cudnn_get_conv_benchmark_empty_cache(); -inline bool miopen_conv_use_channels_last(const at::Tensor& input, const at::Tensor& weight) { - +inline at::MemoryFormat miopen_conv_suggest_memory_format(const at::Tensor& input, const at::Tensor& weight) { // disable NHWC for float64 input. if (!at::detail::getCUDAHooks().compiledWithMIOpen() || input.scalar_type() == at::kDouble || weight.scalar_type() == at::kDouble) { - return false; + return at::MemoryFormat::Contiguous; } // TODO: Remove PYTORCH_MIOPEN_SUGGEST_NHWC once ROCm officially supports NHWC in MIOpen - // See #64427 - static std::optional PYTORCH_MIOPEN_SUGGEST_NHWC = c10::utils::check_env("PYTORCH_MIOPEN_SUGGEST_NHWC"); - static bool suggest_nhwc = PYTORCH_MIOPEN_SUGGEST_NHWC && *PYTORCH_MIOPEN_SUGGEST_NHWC; + // See https://github.com/pytorch/pytorch/issues/64427. + // non static variable is used to be able to change environment variable in runtime for testing + // enabled by default for ROCm >= 7.0.0 with miopen 3.5 + int miopen_version = detail::getCUDAHooks().compiledWithMIOpen() ? 
detail::getCUDAHooks().versionMIOpen() : 0; + bool is_miopen_3_5 = miopen_version >= 30500; // ROCm 7.0 + bool suggest_nhwc = c10::utils::check_env("PYTORCH_MIOPEN_SUGGEST_NHWC").value_or(is_miopen_3_5); auto input_memory_format = input.suggest_memory_format(); auto weight_memory_format = weight.suggest_memory_format(); @@ -375,13 +377,24 @@ inline bool miopen_conv_use_channels_last(const at::Tensor& input, const at::Ten (input_memory_format == at::MemoryFormat::ChannelsLast) || (weight_memory_format == at::MemoryFormat::ChannelsLast) ); + if (can_use_miopen_channels_last_2d) { + return at::MemoryFormat::ChannelsLast; + } bool can_use_miopen_channels_last_3d = suggest_nhwc && (weight_ndim == 5) && ( (input_memory_format == at::MemoryFormat::ChannelsLast3d) || (weight_memory_format == at::MemoryFormat::ChannelsLast3d) ); + if (can_use_miopen_channels_last_3d) { + return at::MemoryFormat::ChannelsLast3d; + } + + return at::MemoryFormat::Contiguous; +} - return can_use_miopen_channels_last_2d || can_use_miopen_channels_last_3d; +// deprecated, but to remove would be BC-breaking +inline bool miopen_conv_use_channels_last(const at::Tensor& input, const at::Tensor& weight) { + return miopen_conv_suggest_memory_format(input, weight) != at::MemoryFormat::Contiguous; } inline bool mkldnn_conv_use_channels_last(const at::Tensor& input, const at::Tensor& weight) { diff --git a/aten/src/ATen/native/Convolution.cpp b/aten/src/ATen/native/Convolution.cpp index d2b7b055684e..2e0e4a47f37b 100644 --- a/aten/src/ATen/native/Convolution.cpp +++ b/aten/src/ATen/native/Convolution.cpp @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -300,67 +301,50 @@ struct ConvParams { bool allow_tf32{}; bool is_strided() const { - bool is_strided = false; - for (const auto& s : stride) { - is_strided |= (s != 1); - } - return is_strided; + return std::any_of( + stride.cbegin(), stride.cend(), [](const T& s) { return s != 1; }); } bool is_dilated() const { - bool is_dilated = false; - for (const auto& d : dilation) { - is_dilated |= (d != 1); - } - return is_dilated; + return std::any_of( + dilation.cbegin(), dilation.cend(), [](const T& d) { return d != 1; }); } bool is_padded() const { - bool is_padded = false; - for (auto p : padding) { - is_padded |= (p != 0); - } - return is_padded; + return std::any_of( + padding.cbegin(), padding.cend(), [](const T& p) { return p != 0; }); } bool is_output_padding_neg() const { - bool is_non_neg = false; - for (const auto& p : output_padding) { - is_non_neg |= (p < 0); - } - return is_non_neg; + return std::any_of( + output_padding.cbegin(), + output_padding.cend(), + [](const T& p) { return p < 0; }); } bool is_output_padding_big() const { - bool is_big = false; + // Revisit this with std::views::zip at C++20. 
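The predicate rewrites in this hunk come in two shapes: single-container checks become std::any_of, while the output_padding check just below keeps an index loop because it compares two containers position by position (the std::views::zip note refers to that case). A small self-contained sketch of both shapes, assuming int64_t elements:

#include <algorithm>
#include <cstdint>
#include <vector>

// Any-element predicate, the std::any_of form used for is_strided()/is_padded()/etc.
bool any_nonzero(const std::vector<int64_t>& v) {
  return std::any_of(v.cbegin(), v.cend(), [](int64_t x) { return x != 0; });
}

// Pairwise predicate, the is_output_padding_big() form: it needs both containers
// at the same index, so an index loop is kept until views::zip is available.
bool any_pad_ge_stride(const std::vector<int64_t>& output_padding,
                       const std::vector<int64_t>& stride) {
  for (size_t i = 0; i < output_padding.size(); ++i) {
    if (output_padding[i] >= stride[i]) {
      return true;
    }
  }
  return false;
}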
for (auto i: c10::irange(output_padding.size())) { - is_big |= (output_padding[i] >= stride[i]); + if (output_padding[i] >= stride[i]) { + return true; + } } - return is_big; + return false; } bool is_padding_neg() const { - bool is_non_neg = false; - for (const auto& p : padding) { - is_non_neg |= (p < 0); - } - return is_non_neg; + return std::any_of( + padding.cbegin(), padding.cend(), [](const T& p) { return p < 0; }); } bool is_dilation_neg() const { - bool is_non_neg = false; - for (const auto& p : dilation) { - is_non_neg |= (p < 0); - } - return is_non_neg; + return std::any_of( + dilation.cbegin(), dilation.cend(), [](const T& d) { return d < 0; }); } bool is_stride_nonpos() const { - bool is_nonpos = false; - for (const auto& s : stride) { - is_nonpos |= (s <= 0); - } - return is_nonpos; + return std::any_of( + stride.cbegin(), stride.cend(), [](const T& s) { return s <= 0; }); } void view1d_as_2d() { @@ -426,11 +410,23 @@ struct ConvParams { // cudnn and miopen are guaranteed not to be on mobile, and T102591915 / T110194934 suggest // that maybe the compiledWithCuDNN() check sometimes segfaults (though I can't imagine how) #if !defined(C10_MOBILE) - if (!detail::getCUDAHooks().compiledWithCuDNN()) { + if (!detail::getCUDAHooks().compiledWithCuDNN() || !input.is_cuda() || !cudnn_enabled) { return false; } + static long cudnn_version = detail::getCUDAHooks().versionCuDNN(); + // broken on cuDNN 9.8 + if (cudnn_version >= 90800) { + if (cudnn_conv_suggest_memory_format(input, weight) == at::MemoryFormat::Contiguous && + (input.scalar_type() == at::kBFloat16 || input.scalar_type() == at::kHalf) && + weight.dim() == 5) { + for (int i = 2; i < weight.dim(); i++) { + if (weight.size(i) != 1) { + return false; + } + } + } + } if (needs_64bit_indexing_no_split(input, weight)) { - static long cudnn_version = detail::getCUDAHooks().versionCuDNN(); if (!(cudnn_version >= 90300 && at::native::cudnnv8_enabled_check_debug())) { TORCH_WARN_ONCE("cuDNN cannot be used for large non-batch-splittable convolutions" " if the V8 API is not enabled or before cuDNN version 9.3+." @@ -438,9 +434,6 @@ struct ConvParams { return false; } } - if (!input.is_cuda() || !cudnn_enabled) { - return false; - } if (input.scalar_type() == at::kBFloat16 || weight.scalar_type() == at::kBFloat16) { if (!(detail::getCUDAHooks().supportsBFloat16ConvolutionWithCuDNNv8() && at::native::cudnnv8_enabled_check_debug())) { return false; @@ -459,13 +452,19 @@ struct ConvParams { // Use cudnn for FP16 depthwise convolutions bool use_cudnn_depthwise(const at::Tensor& input, const at::Tensor& weight) const { - if (cudnn_conv_suggest_memory_format(input, weight) != at::MemoryFormat::Contiguous && use_cudnn(input, weight)) { - // always use cudnn_depthwise for channels_last format - return true; + if (!cudnn_enabled || !detail::getCUDAHooks().compiledWithCuDNN() || !input.is_cuda()) { + return false; } // native kernel doesn't support 64-bit non-splittable case - if (cudnn_enabled && !(canUse32BitIndexMath(input) && canUse32BitIndexMath(weight))) { + if (!(canUse32BitIndexMath(input) && canUse32BitIndexMath(weight))) { static long cudnn_version = detail::getCUDAHooks().compiledWithCuDNN() ? 
detail::getCUDAHooks().versionCuDNN() : -1; + // TODO(eqy): remove this once cuDNN fixes 64-bit depthwise support, first broken in 9.11x + if (cudnn_conv_suggest_memory_format(input, weight) != at::MemoryFormat::Contiguous) { + if (cudnn_version < 0 || cudnn_version > 91000) { + return false; + } + } + if (!(cudnn_version >= 90300 && at::native::cudnnv8_enabled_check_debug())) { TORCH_WARN_ONCE("cuDNN cannot be used for large non-batch-splittable convolutions" " if the V8 API is not enabled or before cuDNN version 9.3+." @@ -475,6 +474,10 @@ struct ConvParams { return true; } } + if (cudnn_conv_suggest_memory_format(input, weight) != at::MemoryFormat::Contiguous) { + // always use cudnn_depthwise for channels_last format + return true; + } if (detail::getCUDAHooks().supportsDepthwiseConvolutionWithCuDNN()) { bool kernel_cond = (use_cudnn(input, weight) && input.scalar_type() == kHalf && // only for FP16 @@ -1419,10 +1422,8 @@ static inline at::MemoryFormat determine_backend_memory_format( case ConvBackend::Miopen: case ConvBackend::MiopenDepthwise: case ConvBackend::MiopenTranspose: - if (detail::getCUDAHooks().compiledWithMIOpen() && miopen_conv_use_channels_last(input, weight)) { - TORCH_INTERNAL_ASSERT((k == 4 || k == 5), - "Expected 4D or 5D input for miopen memory format selection in determine_backend_memory_format()"); - backend_memory_format = (k == 5) ? at::MemoryFormat::ChannelsLast3d : at::MemoryFormat::ChannelsLast; + if (detail::getCUDAHooks().compiledWithMIOpen()) { + backend_memory_format = miopen_conv_suggest_memory_format(input, weight); } break; case ConvBackend::Mkldnn: diff --git a/aten/src/ATen/native/GroupedMMUtils.h b/aten/src/ATen/native/GroupedMMUtils.h new file mode 100644 index 000000000000..78993308cd5f --- /dev/null +++ b/aten/src/ATen/native/GroupedMMUtils.h @@ -0,0 +1,167 @@ +#pragma once + +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#include +#else +#include +#include +#include +#include +#endif + +namespace at::native { + +inline bool check_valid_strides_and_return_transposed(const Tensor& mat) { + IntArrayRef tensor_strides = mat.strides(); + IntArrayRef tensor_sizes = mat.sizes(); + int end_dim = mat.dim() - 1; + int alignment = 16 / mat.element_size(); + TORCH_CHECK(uint64_t(mat.data_ptr()) % 16 ==0, "expected data_ptr to be aligned to 16 bytes\n"); + if ((tensor_strides[end_dim - 1] == 1) && (tensor_strides[end_dim] >= std::max(1, tensor_sizes[end_dim - 1]))) { + TORCH_CHECK(tensor_strides[end_dim] % alignment == 0, "strides should be multiple of 16 bytes"); + return true; + } else if ((tensor_strides[end_dim] == 1) && (tensor_strides[end_dim - 1] >= std::max(1, tensor_sizes[end_dim]))) { + TORCH_CHECK(tensor_strides[end_dim - 1] % alignment == 0, "strides should be multiple of 16 bytes"); + return false; + } else { + TORCH_CHECK(false, "Invalid strides/sizes, got ", mat.strides(), " for strides and ", mat.sizes(), " for sizes"); + } +} + +inline at::Tensor create_grouped_gemm_output_tensor(const Tensor& mat_a, +const Tensor& mat_b, +const std::optional& offs, +c10::ScalarType out_dtype +) { + c10::SmallVector out_size; + const bool a_is_2d = mat_a.dim() == 2; + const bool b_is_2d = mat_b.dim() == 2; + if (a_is_2d) { + if (b_is_2d) { + out_size = {offs->size(0), mat_a.size(0), mat_b.size(1)}; + } else { + TORCH_CHECK(offs->size(0) == mat_b.size(0), "matrix batch sizes have to match"); + out_size = {mat_a.size(0), mat_b.size(-1)}; + } + } else { + if (b_is_2d) { + // this case is not actually encountered for MoE gemms + 
TORCH_CHECK(offs->size(0) == mat_a.size(0), "matrix batch sizes have to match"); + out_size = {mat_a.size(1), mat_b.size(1)}; + } else { // regular bmm + TORCH_CHECK(mat_a.size(0) == mat_b.size(0), "batched dimension has to match"); + out_size = {mat_a.size(0), mat_a.size(1), mat_b.size(-1)}; + } + } + + #ifndef USE_ROCM + // For TMA transfers, strides of output tensor have to be either + // 1, or aligned to 16 bytes. + const auto last_dim = out_size.size() - 1; + const auto alignment = 16 / c10::elementSize(out_dtype); + const int64_t size_padded = (out_size[last_dim] + alignment - 1) / alignment * alignment; + std::vector out_stride; + if (a_is_2d != b_is_2d) { + out_stride = {size_padded, 1}; + } else { + out_stride = {out_size[1] * size_padded, size_padded, 1}; + } + return at::empty_strided(out_size, out_stride, mat_a.options().dtype(out_dtype)); + #else + return at::empty(out_size, mat_a.options().dtype(out_dtype)); + #endif +} + +inline void _grouped_mm_validate_inputs(const Tensor& mat_a, const Tensor& mat_b, +const std::optional& offs, +const std::optional& bias, +std::optional out_dtype) { + TORCH_CHECK((mat_a.dtype() == at::kBFloat16) || (mat_a.dtype() == at::kFloat) || (mat_a.dtype() == at::kHalf), "Expected mat_a to be Float32, BFloat16 or Float16 matrix, got ", mat_a.scalar_type()); + TORCH_CHECK((mat_b.dtype() == at::kBFloat16) || (mat_b.dtype() == at::kFloat) || (mat_b.dtype() == at::kHalf), "Expected mat_b to be Float32, BFloat16 or Float16 matrix, got ", mat_b.scalar_type()); + TORCH_CHECK(mat_a.dim() == 2 || mat_a.dim() == 3, "mat_a has to be 2 or 3d"); + TORCH_CHECK(mat_b.dim() == 2 || mat_b.dim() == 3, "mat_b has to be 2 or 3d"); + const bool a_is_2d = mat_a.dim() == 2; + const bool b_is_2d = mat_b.dim() == 2; + if (!a_is_2d || !b_is_2d) { + TORCH_CHECK(mat_a.size(-1) == mat_b.size(-2), "contraction dimension of mat_a and mat_b must match"); + } + + // check that the strides are valid, the fn will throw an error if not + check_valid_strides_and_return_transposed(mat_a); + check_valid_strides_and_return_transposed(mat_b); + TORCH_CHECK(offs.has_value() == (a_is_2d || b_is_2d), "Have to provide offsets if there is a 2d matrix, or no offset if both matrices are 3d"); + + if (offs.has_value()) { + TORCH_CHECK(offs->dim() == 1, "offs has to be 1D"); + TORCH_CHECK(offs->dtype() == at::kInt, "Offsets have to be int32"); + } + TORCH_CHECK(!bias.has_value(), "Bias not supported yet"); +} + +inline c10::ScalarType _resolve_grouped_mm_out_dtype(const Tensor& mat_a, const Tensor& mat_b, +std::optional out_dtype) { + const auto out_dtype_ = out_dtype.value_or(mat_a.scalar_type()); + // TODO(future PR): enable float32 output dtype for bfloat16 and float16 inputs + TORCH_CHECK(out_dtype_ == mat_a.dtype(), "Grouped gemm output dtype must match `mat_a` dtype"); + return out_dtype_; +} + + +inline void _grouped_mm_fallback(const Tensor& mat_a, const Tensor& mat_b, +const std::optional& offs, +const std::optional& bias, +std::optional out_dtype, +Tensor out) { + LOG(INFO) << "fallback path for `torch._grouped_mm`, performance may not be optimal"; + const bool a_is_2d = mat_a.dim() == 2; + const bool b_is_2d = mat_b.dim() == 2; + if (a_is_2d && !b_is_2d) { + // 2d x 3d with offsets + int group_start_idx = 0; + auto offs_cpu = offs.value().cpu(); + for (int group_idx = 0; group_idx < offs_cpu.size(0); group_idx++) { + int group_end_idx = offs_cpu[group_idx].item(); + auto mat_a_slice = mat_a.slice(0, group_start_idx, group_end_idx); + auto out_slice = out.slice(0, group_start_idx, 
group_end_idx); + at::mm_out(out_slice, mat_a_slice, mat_b[group_idx]); + group_start_idx = group_end_idx; + } + + } else if (!a_is_2d && b_is_2d) { + // 3d x 2d with offsets + int group_start_idx = 0; + auto offs_cpu = offs.value().cpu(); + for (int group_idx = 0; group_idx < offs_cpu.size(0); group_idx++) { + int group_end_idx = offs_cpu[group_idx].item(); + auto mat_b_slice = mat_b.slice(1, group_start_idx, group_end_idx); + auto out_slice = out.slice(1, group_start_idx, group_end_idx); + at::mm_out(out_slice, mat_a[group_idx], mat_b_slice); + group_start_idx = group_end_idx; + } + + } else if (a_is_2d && b_is_2d) { + // 2d x 2d with offsets + int group_start_idx = 0; + auto offs_cpu = offs.value().cpu(); + for (int group_idx = 0; group_idx < offs_cpu.size(0); group_idx++) { + int group_end_idx = offs_cpu[group_idx].item(); + auto mat_a_slice = mat_a.slice(1, group_start_idx, group_end_idx); + auto mat_b_slice = mat_b.slice(0, group_start_idx, group_end_idx); + auto out_slice = out[group_idx]; + at::mm_out(out_slice, mat_a_slice, mat_b_slice); + group_start_idx = group_end_idx; + } + + } else { + // 3d x 3d without offsets - regular bmm + at::bmm_out(out, mat_a, mat_b); + } +} + + +} // namespace at::native diff --git a/aten/src/ATen/native/Linear.cpp b/aten/src/ATen/native/Linear.cpp index 5d3a84ea39f6..a744da3bcad2 100644 --- a/aten/src/ATen/native/Linear.cpp +++ b/aten/src/ATen/native/Linear.cpp @@ -185,6 +185,17 @@ static Tensor sumproduct_pair(const Tensor& left_, const Tensor& right_, IntArra // right: "lro, summed, ro" permuted with rpermutation and the three flattened // then the permuted output is a view of bmm(left, right) // finally, opermutation reverts the permutation to the original order of dimensions + // By default the output is "lro, lo, 1-for-summed-dims, ro" with original shape dimensions. + // However, if all dimensions from the right operand appear before those from the left + // operand in the final output, we can swap the operands so that bmm directly produces + // the result in the correct memory order. 
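The swap below can be illustrated with a concrete contraction: in "bij,bjk->bki" the right operand's free dimension k precedes the left operand's free dimension i in the output, which is the case the swap targets, so the underlying bmm can produce the requested layout directly instead of through a permuted view. The sketch below only verifies numerical equivalence against a permuted bmm; the shapes are arbitrary examples.

#include <ATen/ATen.h>

void check_swapped_output_order() {
  auto A = at::randn({2, 3, 4});
  auto B = at::randn({2, 4, 5});
  auto out = at::einsum("bij,bjk->bki", {A, B});
  auto ref = at::bmm(A, B).permute({0, 2, 1}).contiguous();
  TORCH_CHECK(at::allclose(out, ref), "einsum result mismatch");
}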
+ + bool swap_lo_ro = !lo.empty() && !ro.empty() && ro.back() < lo.front(); + if (swap_lo_ro) { + std::swap(left, right); + std::swap(lo, ro); + std::swap(lo_size, ro_size); + } auto out_num_dim = lro.size() + lo.size() + sum_dims_.size() + ro.size(); std::vector out_size; out_size.reserve(out_num_dim); diff --git a/aten/src/ATen/native/LinearAlgebra.cpp b/aten/src/ATen/native/LinearAlgebra.cpp index b62c584641db..616e6ec60e13 100644 --- a/aten/src/ATen/native/LinearAlgebra.cpp +++ b/aten/src/ATen/native/LinearAlgebra.cpp @@ -1360,7 +1360,8 @@ Tensor outer(const Tensor& self, const Tensor& vec2) { #endif -#if defined(__aarch64__) && AT_MKLDNN_ACL_ENABLED() +#if !defined(__aarch64__) || AT_MKLDNN_ACL_ENABLED() +// Used by default on x86 platforms and on AArch64+ACL static inline int64_t get_mkldnn_matmul_min_dim() { static auto value = [&] { const int64_t default_min_dim = [&] { @@ -1395,8 +1396,6 @@ static inline bool apply_mkldnn_matmul_heur(int64_t m, int64_t k, int64_t n) { return at::globalContext().userEnabledMkldnn() && m > min_dim && k > min_dim && n > min_dim && m * k * n > min_size; } #endif - - static void addmm_impl_cpu_( Tensor &result, const Tensor &self, Tensor m1, Tensor m2, const Scalar& beta, const Scalar& alpha) { TORCH_INTERNAL_ASSERT(self.dim() == 2 && m1.dim() == 2 && m2.dim() == 2); @@ -1772,8 +1771,8 @@ static inline void bmm_out_or_baddbmm_(const Tensor& self_or_result_, const Tens return (strides[2] == 1 && (sizes[1] == 1 || strides[1] >= sizes[2])) || (strides[1] == 1 && (sizes[2] == 1 || strides[2] >= sizes[1])); }; - -#if defined(__aarch64__) && AT_MKLDNN_ACL_ENABLED() +#if !defined(__aarch64__) || AT_MKLDNN_ACL_ENABLED() + // Always apply mkldnn heuristic on x86 platform, but on ARM only if compiled with ACL bool apply_heur = apply_mkldnn_matmul_heur(batch1.sizes()[1], batch1.sizes()[2], batch2.sizes()[2]); if (apply_heur && use_mkldnn_matmul(batch1, batch2, self_or_result)) { try { @@ -1785,7 +1784,6 @@ static inline void bmm_out_or_baddbmm_(const Tensor& self_or_result_, const Tens } } #endif - if (contraction_size * res_rows * res_cols < 400) { if (is_bmm_out) { AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(kBFloat16, kHalf, batch1.scalar_type(), "bmm", [&] { diff --git a/aten/src/ATen/native/LossNLL.cpp b/aten/src/ATen/native/LossNLL.cpp index 53d56622fe62..ca86292403fb 100644 --- a/aten/src/ATen/native/LossNLL.cpp +++ b/aten/src/ATen/native/LossNLL.cpp @@ -47,10 +47,14 @@ TORCH_META_FUNC(nll_loss_forward) TORCH_CHECK( target.dim() <= 1, "0D or 1D target tensor expected, multi-target not supported"); - - auto no_batch_dim = self.dim() == 1 && target.dim() == 0; + if (self.dim() == 1 && target.dim() == 1) { + TORCH_CHECK_VALUE( + target.size(0) == 1, + "For 1D input, 1D target must have size 1, but got target size: ", + target.size(0)); + } TORCH_CHECK( - no_batch_dim || (self.size(0) == target.size(0)), + self.dim() == 1 || (self.size(0) == target.size(0)), "size mismatch (got input: ", self.sizes(), ", target: ", diff --git a/aten/src/ATen/native/Onehot.cpp b/aten/src/ATen/native/Onehot.cpp index 2ac513bf0888..8833bdb6e471 100644 --- a/aten/src/ATen/native/Onehot.cpp +++ b/aten/src/ATen/native/Onehot.cpp @@ -1,5 +1,6 @@ #define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include +#include #ifndef AT_PER_OPERATOR_HEADERS #include @@ -24,8 +25,13 @@ Tensor one_hot(const Tensor &self, int64_t num_classes) { if (num_classes == -1) { num_classes = self.max().item().toLong() + 1; } - at::Tensor index = at::arange(num_classes, self.options()); - return 
at::eq(self.unsqueeze(-1), index).to(kLong); + { + // If `self` is a DTensor, then allow implicit replication + // of the `index` Tensor. + at::DTensorAllowImplicitReplication guard; + at::Tensor index = at::arange(num_classes, self.options()); + return at::eq(self.unsqueeze(-1), index).to(kLong); + } } auto shape = self.sizes().vec(); diff --git a/aten/src/ATen/native/PadNd.cpp b/aten/src/ATen/native/PadNd.cpp index 8072d24a1090..8099648d37b2 100644 --- a/aten/src/ATen/native/PadNd.cpp +++ b/aten/src/ATen/native/PadNd.cpp @@ -240,8 +240,15 @@ Tensor _pad_enum_symint(const Tensor &self, c10::SymIntArrayRef pad, int64_t mod default: {} } } - C10_THROW_ERROR(NotImplementedError, - "Only 2D, 3D, 4D, 5D padding with non-constant padding are supported for now"); + + std::ostringstream error_msg; + error_msg << "Padding size " << pad.size() << " is not supported for " << input_dim << "D input tensor.\n"; + error_msg << "Supported combinations for non-constant padding:\n"; + error_msg << " - 2D or 3D input: padding size = 2 (pads last dimension)\n"; + error_msg << " - 3D or 4D input: padding size = 4 (pads last 2 dimensions)\n"; + error_msg << " - 4D or 5D input: padding size = 6 (pads last 3 dimensions)"; + + C10_THROW_ERROR(NotImplementedError, error_msg.str()); } Tensor pad_symint(const Tensor &self, c10::SymIntArrayRef pad, std::string_view mode, std::optional value) { diff --git a/aten/src/ATen/native/TensorAdvancedIndexing.cpp b/aten/src/ATen/native/TensorAdvancedIndexing.cpp index 408faea1b764..7d613fc02312 100644 --- a/aten/src/ATen/native/TensorAdvancedIndexing.cpp +++ b/aten/src/ATen/native/TensorAdvancedIndexing.cpp @@ -2174,7 +2174,7 @@ static void _scatter_via_index_put( if (self.dim() == 1 || broadcast_index) { Tensor squeezed = index; if (broadcast_index && index.dim() > 1) { - for (const auto d : c10::irange(index.dim())) { + for (int64_t d = index.dim() - 1; d >= 0; --d) { if (d == dim) { continue; } diff --git a/aten/src/ATen/native/TensorFactories.cpp b/aten/src/ATen/native/TensorFactories.cpp index 054cc66cf8eb..1886e65fc1ed 100644 --- a/aten/src/ATen/native/TensorFactories.cpp +++ b/aten/src/ATen/native/TensorFactories.cpp @@ -1640,6 +1640,9 @@ Tensor zeros_symint( std::optional layout, std::optional device, std::optional pin_memory) { + for (const auto& dim_size : size) { + TORCH_CHECK(dim_size >= 0, "zeros: Dimension size must be non-negative."); + } Layout layout_ = layout.value_or(Layout::Strided); if (at::sparse_csr::is_sparse_compressed(layout_)) { return zeros_sparse_compressed_symint( diff --git a/aten/src/ATen/native/TensorProperties.cpp b/aten/src/ATen/native/TensorProperties.cpp index 77acfe47363e..4fa0556ad785 100644 --- a/aten/src/ATen/native/TensorProperties.cpp +++ b/aten/src/ATen/native/TensorProperties.cpp @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -57,6 +58,12 @@ c10::SymInt sym_size(const Tensor& self, int64_t dim) { return self.sym_size(dim); } +c10::SymBool sym_is_contiguous( + const Tensor& self, + c10::MemoryFormat memory_format) { + return self.sym_is_contiguous(memory_format); +} + c10::SymInt sym_stride(const Tensor& self, int64_t dim) { return self.sym_stride(dim); } diff --git a/aten/src/ATen/native/cpu/Loops.h b/aten/src/ATen/native/cpu/Loops.h index 5715fd8f047f..83b51a998563 100644 --- a/aten/src/ATen/native/cpu/Loops.h +++ b/aten/src/ATen/native/cpu/Loops.h @@ -89,7 +89,7 @@ execute_op(char* C10_RESTRICT data[], const int64_t* strides, int64_t i, int64_t using result_type = typename traits::result_type; 
for (; i < n; i++) { result_type* out_ptr = (result_type*)(data[0] + i * strides[0]); - *out_ptr = c10::guts::apply(op, dereference( + *out_ptr = std::apply(op, dereference( &data[1], &strides[1], i)); @@ -102,7 +102,7 @@ inline void execute_op(char* C10_RESTRICT data[], const int64_t* strides, int64_t i, int64_t n, func_t&& op) { using traits = function_traits; for (; i < n; i++) { - c10::guts::apply(op, dereference( + std::apply(op, dereference( &data[0], &strides[0], i)); @@ -162,7 +162,7 @@ void handle_tuple_outputs(char* C10_RESTRICT data[], } // Loop operation for `cpu_kernel_multiple_outputs`. -// 1. Use `c10::guts::apply` to make dynamic method invocation +// 1. Use `std::apply` to make dynamic method invocation // for the lambda passed in `cpu_kernel_multiple_outputs`. // 2. Iterate over the members of the returned tuple, set the corresponding // output tensor by the tuple member in `handle_tuple_outputs` function. @@ -183,7 +183,7 @@ multiple_outputs_loop(char* C10_RESTRICT data[], const int64_t* strides_, int64_ } for (; i < n; i++) { - auto output = c10::guts::apply(op, dereference( + auto output = std::apply(op, dereference( &data[num_outputs], &strides[num_outputs], i)); @@ -213,8 +213,8 @@ vectorized_loop(char** C10_RESTRICT data_, int64_t n, int64_t S, func_t&& op, ve for (; i <= n - 2 * Vec::size(); i += 2 * Vec::size()) { auto args1 = dereference_vec(&data[1], opt_scalar, S, i); auto args2 = dereference_vec(&data[1], opt_scalar, S, i + Vec::size()); - auto out1 = c10::guts::apply(vop, std::move(args1)); - auto out2 = c10::guts::apply(vop, std::move(args2)); + auto out1 = std::apply(vop, std::move(args1)); + auto out2 = std::apply(vop, std::move(args2)); out1.store(data[0] + i * sizeof(scalar_t)); out2.store(data[0] + (i + Vec::size()) * sizeof(scalar_t)); } diff --git a/aten/src/ATen/native/cpu/PaddingKernel.cpp b/aten/src/ATen/native/cpu/PaddingKernel.cpp index e3f08194bb58..59d838b9782d 100644 --- a/aten/src/ATen/native/cpu/PaddingKernel.cpp +++ b/aten/src/ATen/native/cpu/PaddingKernel.cpp @@ -156,7 +156,7 @@ void cpu_padding( int64_t offset_h = ndim >= 2 ? p.offsets[ndim - 2] : 0; int64_t offset_w = p.offsets[ndim - 1]; - // do vectorized copy whe output is overlapped with input on W, + // do vectorized copy when output is overlapped with input on W, // only applies to positive padding auto loop = [=](scalar_t* out, const scalar_t* in, bool positive_padding) { if (positive_padding) { diff --git a/aten/src/ATen/native/cpu/batch_norm_kernel.cpp b/aten/src/ATen/native/cpu/batch_norm_kernel.cpp index 5a288193143d..d013dfa0485e 100644 --- a/aten/src/ATen/native/cpu/batch_norm_kernel.cpp +++ b/aten/src/ATen/native/cpu/batch_norm_kernel.cpp @@ -318,7 +318,7 @@ batch_norm_cpu_collect_stats_channels_last_impl( // // The optimal THRESHOLD to tile was found empirically. // When C > THRESHOLD, C is large enough that the benefit from tiling and vectorization outweigh the synchronization overhead. - // Wehn C <= TILE_SIZE, the problem size is small enough (C <= TILE_SIZE && NHW <= max_threads) that it's better to launch single thread with vectorization than C threads without vectorization. + // When C <= TILE_SIZE, the problem size is small enough (C <= TILE_SIZE && NHW <= max_threads) that it's better to launch single thread with vectorization than C threads without vectorization. // // When num_threads == 1, always use Method 2 as there is no synchronization overhead. 
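The Loops.h changes swap c10::guts::apply for C++17 std::apply, which invokes a callable with the elements of a tuple as its arguments; in the kernels that tuple is built by the dereference helpers from the strided input pointers. A standalone illustration of the call shape, using two literals in place of the dereferenced operands:

#include <cstdio>
#include <tuple>
#include <utility>

int main() {
  auto op = [](float a, float b) { return a + b; };
  std::tuple<float, float> args{1.5f, 2.5f};
  // std::apply unpacks the tuple into op's parameter list.
  float out = std::apply(op, args);
  std::printf("%f\n", out);  // prints 4.000000
  return 0;
}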
// diff --git a/aten/src/ATen/native/cuda/Blas.cpp b/aten/src/ATen/native/cuda/Blas.cpp index a7c17893903b..23447c7e09b3 100644 --- a/aten/src/ATen/native/cuda/Blas.cpp +++ b/aten/src/ATen/native/cuda/Blas.cpp @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -1079,6 +1080,16 @@ static bool _scaled_mm_allowed_device(bool sm90_only=false, bool sm100_only=fals #endif } +static bool _grouped_mm_allowed_device() { +#ifdef USE_ROCM + return false; +#else + auto dprops = at::cuda::getCurrentDeviceProperties(); + // CUDA capability 8.0 and greater + return dprops->major >= 8; +#endif +} + #ifdef USE_ROCM static bool _scaled_mm_is_fnuz() { return at::detail::getCUDAHooks().isGPUArch({"gfx942"}); @@ -1540,71 +1551,8 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2, } namespace { - at::Tensor create_grouped_gemm_output_tensor(const Tensor& mat_a, - const Tensor& mat_b, - const std::optional& offs, - std::optional out_dtype - ) { - c10::SmallVector out_size; - const bool a_is_2d = mat_a.dim() == 2; - const bool b_is_2d = mat_b.dim() == 2; - if (a_is_2d) { - if (b_is_2d) { - out_size = {offs->size(0), mat_a.size(0), mat_b.size(1)}; - } else { - TORCH_CHECK(offs->size(0) == mat_b.size(0), "matrix batch sizes have to match"); - out_size = {mat_a.size(0), mat_b.size(-1)}; - } - } else { - if (b_is_2d) { - // this case is not actually encountered for MoE gemms - TORCH_CHECK(offs->size(0) == mat_a.size(0), "matrix batch sizes have to match"); - out_size = {mat_a.size(1), mat_b.size(1)}; - } else { // regular bmm - TORCH_CHECK(mat_a.size(0) == mat_b.size(0), "batched dimension has to match"); - out_size = {mat_a.size(0), mat_a.size(1), mat_b.size(-1)}; - } - } - - const auto out_dtype_ = out_dtype.value_or(kBFloat16); - TORCH_CHECK(out_dtype_ == kBFloat16, "Only bf16 high precision output types are supported for grouped gemm"); - - #ifndef USE_ROCM - // For TMA transfers, strides of output tensor have to be either - // 1, or aligned to 16 bytes. 
- const auto last_dim = out_size.size() - 1; - const auto alignment = 16 / c10::elementSize(out_dtype_); - const int64_t size_padded = (out_size[last_dim] + alignment - 1) / alignment * alignment; - std::vector out_stride; - if (a_is_2d != b_is_2d) { - out_stride = {size_padded, 1}; - } else { - out_stride = {out_size[1] * size_padded, size_padded, 1}; - } - return at::empty_strided(out_size, out_stride, mat_a.options().dtype(out_dtype_)); - #else - return at::empty(out_size, mat_a.options().dtype(out_dtype_)); - #endif - } - - bool check_valid_strides_and_return_transposed(const Tensor& mat) { - IntArrayRef tensor_strides = mat.strides(); - IntArrayRef tensor_sizes = mat.sizes(); - int end_dim = mat.dim() - 1; - int alignment = 16 / mat.element_size(); - TORCH_CHECK(uint64_t(mat.data_ptr()) % 16 ==0, "expected data_ptr to be aligned to 16 bytes\n"); - if ((tensor_strides[end_dim - 1] == 1) && (tensor_strides[end_dim] >= std::max(1, tensor_sizes[end_dim - 1]))) { - TORCH_CHECK(tensor_strides[end_dim] % alignment == 0, "strides should be multiple of 16 bytes"); - return true; - } else if ((tensor_strides[end_dim] == 1) && (tensor_strides[end_dim - 1] >= std::max(1, tensor_sizes[end_dim]))) { - TORCH_CHECK(tensor_strides[end_dim - 1] % alignment == 0, "strides should be multiple of 16 bytes"); - return false; - } else { - TORCH_CHECK(false, "Invalid strides/sizes, got ", mat.strides(), " for strides and ", mat.sizes(), " for sizes"); - } - } - - void check_scale(const Tensor& mat, const Tensor& scale, const int dim, const int arg_idx, const int scale_multiplier=1) { + void _check_scales_fp8_rowwise(const Tensor& mat, const Tensor& scale, const int dim, const int arg_idx, const int scale_multiplier=1) { + // Checks scales for 2d or 3d target tensors (`mat`). if (mat.dim() == 2) { TORCH_CHECK( scale.dim() == 1, @@ -1638,9 +1586,66 @@ namespace { "scale must have the same first dimension as mat for arg ", arg_idx); } -} + } + void _check_scales_mxfp8(const Tensor& mat, const Tensor& scale, const int dim, const int arg_idx) { + // Checks scales for 2d or 3d target tensors (`mat`). + if (mat.dim() == 2) { + // For MXFP8, 2d tensors have variable size groups represented as subtensors, + // that are converted to blocked padded format individually, + // so we can't check the scale sizes without doing a d2h sync to get the group sizes here. + TORCH_CHECK( + scale.dim() == mat.dim(), + "for mxfp8, scale must have same number of dimensions as parent tensor, but got mat.dim() = ", mat.dim(), " and scale.dim() = ", scale.dim(), " for arg ", arg_idx); + + // LHS mat shape (M, total_K) -> scale shape (rounded_up(M, 128), rounded_up_per_group(K/32, 4)) + // RHS mat shape (total_K, N) -> scale shape (rounded_up(N, 128), rounded_up_per_group(K/32, 4)) + // * weight is transposed prior to the call, scale stays non-transposed. + bool LHS = arg_idx == 0; + int scale_dim_to_check = 0; + int mat_dim_to_check = LHS ? 0 : 1; + TORCH_CHECK( + scale.size(scale_dim_to_check) >= mat.size(mat_dim_to_check), + "for mxfp8, arg ", arg_idx, " tensor shape (", mat.size(0), ", ", mat.size(1), ") ", + "must have scale.shape[", scale_dim_to_check, "] >= ", mat.size(mat_dim_to_check), " but got scale.shape=(", scale.size(0), ", ", scale.size(1), ")"); + } else { + // For MXFP8, 3d tensors have static group sizes (stack of 2d tensors), + // so we can check the exact expected scale sizes here without a d2h sync. 
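The 3d branch below derives the expected blocked scale shape from the rounding rules stated in the comments: K/32 scales along the contraction dimension, rounded up to a multiple of 4, and N rounded up to a multiple of 128, then flattened per group. A worked numeric sketch of that arithmetic; the (G, K, N) values are made-up examples, not values from this change.

#include <cstdint>
#include <cstdio>

int64_t round_up(int64_t x, int64_t y) { return ((x + y - 1) / y) * y; }

int main() {
  // Example 3d RHS of shape (G, K, N) = (8, 4096, 520).
  int64_t G = 8, K = 4096, N = 520;
  int64_t blocked_scale_K = round_up(K / 32, 4);  // 4096/32 = 128, already a multiple of 4
  int64_t blocked_scale_N = round_up(N, 128);     // 520 rounds up to 640
  // Expected stacked, flattened scale shape: (G, blocked_scale_K * blocked_scale_N).
  std::printf("(%lld, %lld)\n", (long long)G,
              (long long)(blocked_scale_K * blocked_scale_N));  // (8, 81920)
  return 0;
}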
+ auto round_up = [](auto x, auto y) { + return ((x + y - 1) / y) * y; + }; + + // TODO: this is for 3d tensor in 2d-3d case specifically. + // We'll need to support 3d-3d and 3d-2d cases once mxfp8 grouped gemm supports them. + int64_t G = mat.size(0); + int64_t K = mat.size(1); + int64_t N = mat.size(2); + int64_t blocked_scale_K = round_up(K/32, 4); + int64_t blocked_scale_N = round_up(N, 128); + + // fbgemm expects stack of flattened blocked scales for 3d tensor, shape (G, blocked_scale_K * blocked_scale_N). + TORCH_CHECK( + scale.dim() == mat.dim() - 1, + "for mxfp8 2d-3d grouped GEMM, the 3d tensor of shape (G,K,N) must have a 2d scale of shape (G, blocked_scale_K * blocked_scale_N), but scale is ", scale.dim(), "D for arg ", arg_idx + ); + TORCH_CHECK( + scale.size(0) == G && scale.size(1) == blocked_scale_K * blocked_scale_N, + "for mxfp8, the tensor shape (", G, ", ", K, ", ", N, ") must have scale shape (", G, ",", blocked_scale_K, ",", blocked_scale_N, ") for arg ", arg_idx + ); + } + } + void check_scale(const Tensor& mat, const Tensor& scale, const int dim, const int arg_idx, const int scale_multiplier=1) { + bool using_fp8_rowwise = scale.scalar_type() == kFloat; + bool using_mxfp8 = scale.scalar_type() == at::kFloat8_e8m0fnu; + if (using_fp8_rowwise) { + _check_scales_fp8_rowwise(mat, scale, dim, arg_idx, scale_multiplier); + } else if (using_mxfp8) { + _check_scales_mxfp8(mat, scale, dim, arg_idx); + } else { + TORCH_CHECK(false, "scale must be float32 or float8_e8m0fnu, but got ", scale.dtype()); + } + } } Tensor @@ -1665,8 +1670,8 @@ const std::optional& bias, const std::optional& scale_result, std::optional out_dtype, bool use_fast_accum) { - bool allowed_device = _scaled_mm_allowed_device(); - TORCH_CHECK(allowed_device, "torch._scaled_grouped_mm is only supported on CUDA devices with compute capability = 9.0, or ROCm MI300+"); + bool allowed_device = _scaled_mm_allowed_device(/*sm90_only*/true, /*sm100_only*/true); + TORCH_CHECK(allowed_device, "torch._scaled_grouped_mm is only supported on CUDA devices with compute capability = [9.0, 10.0], or ROCm MI300+"); TORCH_CHECK(!check_valid_strides_and_return_transposed(mat_a), "Expected mat1 to not be transposed"); TORCH_CHECK(check_valid_strides_and_return_transposed(mat_b), "Expected mat2 to be transposed"); @@ -1699,16 +1704,47 @@ bool use_fast_accum) { TORCH_CHECK(offs->dtype() == at::kInt, "Offsets have to be int32"); } - // Both Per-Tensor and Row-wise scaling expect fp32 tensors + // FP8 per-tensor and per-row scaling expect fp32 scales. + // MXFP8 expects float8_e8m0fnu scales. TORCH_CHECK( - scale_a.scalar_type() == kFloat && scale_b.scalar_type() == kFloat, - "Both scale_a and scale_b must be float (fp32) tensors."); + (scale_a.scalar_type() == kFloat && scale_b.scalar_type() == kFloat) || + (scale_a.scalar_type() == at::kFloat8_e8m0fnu && scale_b.scalar_type() == at::kFloat8_e8m0fnu), + "For FP8 tensorwise and rowwise, both scales must both be float32 tensors. For MXFP8, scales must both be float8_e8m0fnu tensors."); const int scale_multiplier = (mat_a.dim() == 2 && mat_b.dim() == 2) ? 
offs->size(0) : 1; check_scale(mat_a, scale_a, 0 ,0, scale_multiplier); check_scale(mat_b, scale_b, 1, 1, scale_multiplier); - Tensor out = create_grouped_gemm_output_tensor(mat_a, mat_b, offs, out_dtype); + const auto out_dtype_ = out_dtype.value_or(kBFloat16); + TORCH_CHECK(out_dtype_ == kBFloat16, "Only bf16 high precision output types are supported for grouped gemm"); + + Tensor out = create_grouped_gemm_output_tensor(mat_a, mat_b, offs, out_dtype_); + +#if defined(USE_FBGEMM_GENAI) && defined(USE_CUDA) && !defined(USE_ROCM) + // MXFP8 grouped GEMM dispatching + bool is_mx8mx8bf16 = ( + mat_a.scalar_type() == at::kFloat8_e4m3fn && mat_b.scalar_type() == at::kFloat8_e4m3fn && + scale_a.scalar_type() == at::kFloat8_e8m0fnu && scale_b.scalar_type() == at::kFloat8_e8m0fnu + ); + TORCH_CHECK(out_dtype == at::kBFloat16, "Only bf16 out_dtype is supported for MXFP8 grouped gemm"); + + if (is_mx8mx8bf16) { + bool b_is_3d = mat_b.dim() == 3; + bool is_2d_2d = a_is_2d && b_is_2d; + bool is_2d_3d = a_is_2d && b_is_3d; + TORCH_CHECK(is_2d_2d || is_2d_3d, "MXFP8 grouped GEMM currently only supports 2d-2d and 2d-3d cases"); + TORCH_CHECK(offs.has_value(), "MXFP8 2d-2d and 2d-3d grouped GEMMs requires offsets"); + + fbgemm_gpu::mx8mx8bf16_grouped_mm( + mat_a, + mat_b, + scale_a, + scale_b, + offs.value(), + out); + return out; + } +#endif #ifndef USE_ROCM TORCH_CHECK(mat_a.dtype() == at::kFloat8_e4m3fn, "Expected mat_a to be Float8_e4m3 matrix got ", mat_a.scalar_type()); @@ -1741,6 +1777,7 @@ bool use_fast_accum) { #else TORCH_CHECK(false, "grouped gemm is not supported without USE_FBGEMM_GENAI on ROCM") #endif + #endif } @@ -1750,33 +1787,21 @@ const std::optional& offs, const std::optional& bias, std::optional out_dtype) { #ifndef USE_ROCM - bool allowed_device = _scaled_mm_allowed_device(/*sm90_only*/true, /*sm100_only*/true); - TORCH_CHECK(allowed_device, "torch._grouped_mm is only supported on CUDA devices with compute capability = 9.0, 10.0"); - - TORCH_CHECK(mat_a.dtype() == at::kBFloat16, "Expected mat_a to be BFloat16 matrix got ", mat_a.scalar_type()); - TORCH_CHECK(mat_b.dtype() == at::kBFloat16, "Expected mat_a to be BFloat16 matrix got ", mat_b.scalar_type()); - TORCH_CHECK(mat_a.dim() == 2 || mat_a.dim() == 3, "mat_a has to be 2 or 3d"); - TORCH_CHECK(mat_b.dim() == 2 || mat_b.dim() == 3, "mat_b has to be 2 or 3d"); - const bool a_is_2d = mat_a.dim() == 2; - const bool b_is_2d = mat_b.dim() == 2; - if (!a_is_2d || !b_is_2d) { - TORCH_CHECK(mat_a.size(-1) == mat_b.size(-2), "contraction dimension of mat_a and mat_b must match"); - } - - // check that the strides are valid, the fn will throw an error if not - check_valid_strides_and_return_transposed(mat_a); - check_valid_strides_and_return_transposed(mat_b); - TORCH_CHECK(offs.has_value() == (a_is_2d || b_is_2d), "Have to provide offsets if there is a 2d matrix, or no offset if both matrices are 3d"); - - if (offs.has_value()) { - TORCH_CHECK(offs->dim() == 1, "offs has to be 1D"); - TORCH_CHECK(offs->dtype() == at::kInt, "Offsets have to be int32"); + _grouped_mm_validate_inputs(mat_a, mat_b, offs, bias, out_dtype); + bool a_b_and_out_are_bf16 = ( + mat_a.dtype() == at::kBFloat16 && + mat_b.dtype() == at::kBFloat16 && + out_dtype.value_or(at::kBFloat16) == at::kBFloat16 + ); + bool use_fast_path = _scaled_mm_allowed_device(/*sm90_only*/true, /*sm100_only*/true) && a_b_and_out_are_bf16; + const auto out_dtype_ = _resolve_grouped_mm_out_dtype(mat_a, mat_b, out_dtype); + Tensor out = create_grouped_gemm_output_tensor(mat_a, mat_b, offs, 
out_dtype_); + if (use_fast_path) { + // fast path, no d2h sync needed + at::cuda::detail::bf16bf16_grouped_mm(mat_a, mat_b, offs, bias, out); + } else { + _grouped_mm_fallback(mat_a, mat_b, offs, bias, out_dtype, out); } - TORCH_CHECK(!bias.has_value(), "Bias not supported yet"); - - Tensor out = create_grouped_gemm_output_tensor(mat_a, mat_b, offs, out_dtype); - - at::cuda::detail::bf16bf16_grouped_mm(mat_a, mat_b, offs, bias, out); return out; #else TORCH_CHECK(false, "grouped gemm is not supported on ROCM") diff --git a/aten/src/ATen/native/cuda/CUDALoops.cuh b/aten/src/ATen/native/cuda/CUDALoops.cuh index 12ad84a15b18..ee28c5c1693f 100644 --- a/aten/src/ATen/native/cuda/CUDALoops.cuh +++ b/aten/src/ATen/native/cuda/CUDALoops.cuh @@ -999,12 +999,41 @@ void gpu_kernel_impl(TensorIteratorBase& iter, const func_t& f) { dtypes[i] = iter.dtype(i); } auto offset_calc = ::make_offset_calculator(iter); +#ifdef USE_ROCM + constexpr int grp_sz = 128; + launch_legacy_kernel_manual_unroll(numel, [=] GPU_LAMBDA(int idx, bool unrl) { + if (unrl) { + auto offsets0 = offset_calc.get(idx); + auto offsets1 = offset_calc.get(idx + grp_sz); + auto offsets2 = offset_calc.get(idx + grp_sz * 2); + auto offsets3 = offset_calc.get(idx + grp_sz * 3); + void* out0 = data[0] + offsets0[0]; + void* out1 = data[0] + offsets1[0]; + void* out2 = data[0] + offsets2[0]; + void* out3 = data[0] + offsets3[0]; + arg0_t result0 = invoke(f, &data[1], &offsets0[1], &dtypes[1], 1); + arg0_t result1 = invoke(f, &data[1], &offsets1[1], &dtypes[1], 1); + arg0_t result2 = invoke(f, &data[1], &offsets2[1], &dtypes[1], 1); + arg0_t result3 = invoke(f, &data[1], &offsets3[1], &dtypes[1], 1); + c10::cast_and_store(dtypes[0], out0, result0); + c10::cast_and_store(dtypes[0], out1, result1); + c10::cast_and_store(dtypes[0], out2, result2); + c10::cast_and_store(dtypes[0], out3, result3); + } else { + auto offsets = offset_calc.get(idx); + void* out = data[0] + offsets[0]; + arg0_t result = invoke(f, &data[1], &offsets[1], &dtypes[1], 1); + c10::cast_and_store(dtypes[0], out, result); + } + }); +#else launch_legacy_kernel<128, 4>(numel, [=] GPU_LAMBDA(int idx) { auto offsets = offset_calc.get(idx); void* out = data[0] + offsets[0]; arg0_t result = invoke(f, &data[1], &offsets[1], &dtypes[1], 1); c10::cast_and_store(dtypes[0], out, result); }); +#endif } } diff --git a/aten/src/ATen/native/cuda/Copy.cu b/aten/src/ATen/native/cuda/Copy.cu index 59b0426bab1f..62a07e1e28c8 100644 --- a/aten/src/ATen/native/cuda/Copy.cu +++ b/aten/src/ATen/native/cuda/Copy.cu @@ -42,6 +42,19 @@ void bfloat16_copy_kernel_cuda(TensorIteratorBase &iter) { }); } +#ifdef USE_ROCM +void bfloat16tofloat32_copy_kernel_cuda(TensorIteratorBase &iter) { + gpu_kernel_nocast(iter, [] GPU_LAMBDA(at::BFloat16 value) { + return static_cast(value); + }); +} +void float16tofloat32_copy_kernel_cuda(TensorIteratorBase &iter) { + gpu_kernel_nocast(iter, [] GPU_LAMBDA(at::Half value) { + return static_cast(value); + }); +} +#endif + void float8_copy_kernel_cuda(TensorIteratorBase &iter) { ScalarType dtype = iter.dtype(0); ScalarType other_dtype = iter.dtype(1); @@ -187,7 +200,17 @@ void direct_copy_kernel_cuda(TensorIteratorBase &iter) { } else { float16_copy_kernel_cuda(iter); } - } else if (isBitsType(dtype)) { + } +#ifdef USE_ROCM + else if ((iter.dtype(1) == kBFloat16 || iter.dtype(1) == kHalf) && dtype == kFloat) { + if (iter.dtype(1) == kBFloat16) { + bfloat16tofloat32_copy_kernel_cuda(iter); + } else { + float16tofloat32_copy_kernel_cuda(iter); + } + } +#endif + else if 
(isBitsType(dtype)) { TORCH_CHECK(dtype == iter.dtype(1), "copy_() does not support casting " "bits types to different bits types. Source dtype is ", iter.dtype(1), "target dtype is ", dtype); AT_DISPATCH_BIT_TYPES(dtype, "copy_", [&] { diff --git a/aten/src/ATen/native/cuda/Indexing.cu b/aten/src/ATen/native/cuda/Indexing.cu index 02feb55cb69d..dacef18c79b6 100644 --- a/aten/src/ATen/native/cuda/Indexing.cu +++ b/aten/src/ATen/native/cuda/Indexing.cu @@ -59,7 +59,7 @@ constexpr uint64_t getDefaultMaxThreadsPerBlock() { #ifdef USE_ROCM #define SKIP_SORTED_INDICES 32 template -__global__ void indexing_backward_kernel( +__global__ void indexing_backward_kernel_many_indices( const int64_t* sorted_indices, const int64_t* indices, const scalar_t* grad_output, scalar_t* grad_weight, int64_t numel, int64_t stride, int64_t stride_before, int64_t outer_dim, bool accumulate) { using opmath_t = at::opmath_type; @@ -254,7 +254,8 @@ __global__ void indexing_backward_kernel_stride_1( } } } -#else +#endif + template __global__ void indexing_backward_kernel( const int64_t* sorted_indices, const int64_t* indices, const scalar_t* grad_output, scalar_t* grad_weight, @@ -333,6 +334,7 @@ __global__ void indexing_backward_kernel( } } +#ifndef USE_ROCM template __global__ void indexing_backward_kernel_stride_1( const int64_t* sorted_indices, const int64_t* indices, const scalar_t* grad_output, scalar_t* grad_weight, @@ -708,6 +710,9 @@ void index_put_with_sort_kernel(Tensor & self, const c10::List(at::cuda::getCurrentDeviceProperties()->maxGridSize[1], ceil_div(sliceSize, (int64_t) (warp_size))) : grid.y, + grid.z); dim3 new_grid(ceil_div(num_indices, (int64_t) (indices_per_block * warp_size)), grid.y, grid.z); size_t smem_dups_size = indices_per_block * warp_size * sizeof(int64_t); #define KERNEL_GRID new_grid @@ -780,11 +785,43 @@ void index_put_with_sort_kernel(Tensor & self, const c10::List= 200000) + AT_DISPATCH_V2( + expandedValue.scalar_type(), + "indexing_backward_many_indices", + AT_WRAP([&] { + indexing_backward_kernel_many_indices<<>>( + sorted_indices.const_data_ptr(), + orig_indices.const_data_ptr(), + expandedValue.const_data_ptr(), + src_.mutable_data_ptr(), + num_indices, + sliceSize, + strideBefore, + nElemBefore, + accumulate); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + }), + AT_EXPAND(AT_ALL_TYPES_AND_COMPLEX), + // AT_EXPAND(AT_FLOAT8_TYPES), + // TODO(#113663): clean up accumulation behavior in float8 dtypes, accumulate=True + // should not be supported here, then reenable AT_FLOAT8_DTYPES + kFloat8_e4m3fn, + kFloat8_e5m2, + kFloat8_e4m3fnuz, + kFloat8_e5m2fnuz, + kComplexHalf, + kHalf, + kBool, + kBFloat16); + else +#endif AT_DISPATCH_V2( expandedValue.scalar_type(), "indexing_backward", AT_WRAP([&] { - indexing_backward_kernel<<>>( + indexing_backward_kernel<<>>( sorted_indices.const_data_ptr(), orig_indices.const_data_ptr(), expandedValue.const_data_ptr(), diff --git a/aten/src/ATen/native/cuda/SegmentReduce.cu b/aten/src/ATen/native/cuda/SegmentReduce.cu index 3acb359342f1..c6f88692a8a5 100644 --- a/aten/src/ATen/native/cuda/SegmentReduce.cu +++ b/aten/src/ATen/native/cuda/SegmentReduce.cu @@ -20,7 +20,7 @@ // SegmentReduce compilation with CUDA-12.9 causes NVCC crash on Windows // See https://github.com/pytorch/pytorch/issues/156181 -#if !defined(_WIN32) || CUDART_VERSION < 12090 +#if !(defined(_WIN32) && CUDART_VERSION == 12090) namespace at::native { @@ -606,4 +606,4 @@ REGISTER_DISPATCH( } // namespace at::native -#endif +#endif \ No newline at end of file diff --git 
a/aten/src/ATen/native/cuda/layer_norm_kernel.cu b/aten/src/ATen/native/cuda/layer_norm_kernel.cu index 940680eb3682..81387bcceaf0 100644 --- a/aten/src/ATen/native/cuda/layer_norm_kernel.cu +++ b/aten/src/ATen/native/cuda/layer_norm_kernel.cu @@ -141,7 +141,11 @@ WelfordDataLN cuWelfordOnlineSum( if constexpr (!rms_norm){ U delta = val - curr_sum.mean; U new_count = curr_sum.count + 1.f; +#if defined(USE_ROCM) && defined(PYTORCH_LAYERNORM_FAST_RECIPROCAL) + U new_mean = curr_sum.mean + delta * __builtin_amdgcn_rcpf(new_count); +#else U new_mean = curr_sum.mean + delta * (1.f/new_count); //proper division is slow, this is less accurate but noticeably faster +#endif return {new_mean, curr_sum.sigma2 + delta * (val - new_mean), new_count}; } else{ return {0.f, curr_sum.sigma2 + val * val, 0}; @@ -159,7 +163,11 @@ WelfordDataLN cuWelfordCombine( U count = dataA.count + dataB.count; U mean, sigma2; if (count > decltype(dataB.count){0}) { +#if defined(USE_ROCM) && defined(PYTORCH_LAYERNORM_FAST_RECIPROCAL) + auto coef = __builtin_amdgcn_rcpf(count); +#else auto coef = 1.f/count; //NB we don't use --use_fast_math, but this is emulation, 1./count goes to intrinsic, `* coef` is multiplication, instead of slow fp division +#endif auto nA = dataA.count * coef; auto nB = dataB.count * coef; mean = nA*dataA.mean + nB*dataB.mean; diff --git a/aten/src/ATen/native/cudnn/MHA.cpp b/aten/src/ATen/native/cudnn/MHA.cpp index 182716ed7a1a..1658ce34ca6c 100644 --- a/aten/src/ATen/native/cudnn/MHA.cpp +++ b/aten/src/ATen/native/cudnn/MHA.cpp @@ -146,7 +146,7 @@ namespace native { namespace fe = cudnn_frontend; -#define MAX_MHA_DIM 4 +constexpr uint8_t MAX_MHA_DIM = 4; // Whether we will use ragged offsets in the dense (non-nested) path // to avoid recompilation @@ -238,7 +238,8 @@ void setMHAParams( const std::optional& attn_bias, double dropout_probability, bool is_causal, - bool return_softmaxstats) { + bool return_softmaxstats, + bool is_nested) { memset(¶ms, 0, sizeof(MHAParams)); params.device_id = at::cuda::current_device(); params.dataType = fe::DataType_t::HALF; @@ -255,23 +256,24 @@ void setMHAParams( params.is_causal = is_causal; params.return_softmaxstats = return_softmaxstats; params.has_attn_bias = attn_bias.has_value(); + // Expect 4D dense tensor, 3D nested case (THD) TORCH_INTERNAL_ASSERT( - q.sizes().size() == MAX_MHA_DIM, + q.sizes().size() == (uint8_t)(MAX_MHA_DIM - (uint8_t)is_nested), "Q tensor has unexpected number of dims, please report a bug to PyTorch."); TORCH_INTERNAL_ASSERT( - q.strides().size() == MAX_MHA_DIM, + q.strides().size() == (uint8_t)(MAX_MHA_DIM - (uint8_t)is_nested), "Q tensor has unexpected number of dims, please report a bug to PyTorch."); TORCH_INTERNAL_ASSERT( - k.sizes().size() == MAX_MHA_DIM, + k.sizes().size() == (uint8_t)(MAX_MHA_DIM - (uint8_t)is_nested), "K tensor has unexpected number of dims, please report a bug to PyTorch."); TORCH_INTERNAL_ASSERT( - k.strides().size() == MAX_MHA_DIM, + k.strides().size() == (uint8_t)(MAX_MHA_DIM - (uint8_t)is_nested), "K tensor has unexpected number of dims, please report a bug to PyTorch."); TORCH_INTERNAL_ASSERT( - v.sizes().size() == MAX_MHA_DIM, + v.sizes().size() == (uint8_t)(MAX_MHA_DIM - (uint8_t)is_nested), "V tensor has unexpected number of dims, please report a bug to PyTorch."); TORCH_INTERNAL_ASSERT( - v.strides().size() == MAX_MHA_DIM, + v.strides().size() == (uint8_t)(MAX_MHA_DIM - (uint8_t)is_nested), "V tensor has unexpected number of dims, please report a bug to PyTorch."); std::copy(q.sizes().begin(), 
q.sizes().end(), params.q_dim.begin()); std::copy(q.strides().begin(), q.strides().end(), params.q_stride.begin()); @@ -320,7 +322,8 @@ struct MHACacheKeyWrapper : ParamsWrapper { const std::optional& attn_bias, double dropout_probability, bool is_causal, - bool return_softmaxstats) { + bool return_softmaxstats, + bool is_nested) { setMHAParams( this->pod, b, @@ -335,7 +338,8 @@ struct MHACacheKeyWrapper : ParamsWrapper { attn_bias, dropout_probability, is_causal, - return_softmaxstats); + return_softmaxstats, + is_nested); } }; @@ -479,6 +483,8 @@ auto build_graph( fe::graph::SDPA_attributes() .set_name("CUDNN_SDPA") .set_is_inference(return_softmaxstats == false) + // TODO(eqy): switch to this API once cuDNN FE is upgraded + // .set_generate_stats(return_softmaxstats) .set_causal_mask(is_causal) .set_attn_scale(attn_scale); if (use_ragged_in_dense(q, k, v, o, attn_bias.has_value())) { @@ -699,6 +705,8 @@ auto build_graph_nestedtensor( fe::graph::SDPA_attributes() .set_name("CUDNN_SDPA_NESTEDTENSOR") .set_is_inference(return_softmaxstats == false) + // TODO(eqy): switch to this API once cuDNN FE is upgraded + // .set_generate_stats(return_softmaxstats) .set_causal_mask(is_causal) .set_attn_scale(attn_scale) .set_seq_len_q(SEQ_LEN_Q_) @@ -1386,7 +1394,8 @@ void run_cudnn_SDP_fprop( attn_bias, dropout_probability, is_causal, - return_softmaxstats); + return_softmaxstats, + false); auto graph_ptr = getMHAGraphCache_().find(key); std::shared_ptr mha_graph; if (graph_ptr) { @@ -1484,30 +1493,53 @@ void run_cudnn_SDP_fprop_nestedtensor( if (return_softmaxstats && !softmaxstats.defined()) { softmaxstats = at::empty({q.size(0), h_q, 1}, q.options().dtype(kFloat)); } - auto mha_graph = build_graph_nestedtensor( + + auto key = MHACacheKeyWrapper( b, h_q, - h_k, - h_v, - s_q, - s_kv, + s_q, // max-seqlen-q + s_kv, // max-seqlen-kv d_qk, d_v, - scaling_factor, - return_softmaxstats, - is_causal, - dropout_probability, - cum_seqlen_q, - cum_seqlen_kv, q, k, v, attn_bias, - softmaxstats, - o, - dropoutseed, - dropoutoffset, - handle); + dropout_probability, + is_causal, + return_softmaxstats, + true); + auto graph_ptr = getMHAGraphCache_().find(key); + std::shared_ptr mha_graph; + + if (graph_ptr) { + mha_graph = *graph_ptr; + } else { + mha_graph = build_graph_nestedtensor( + b, + h_q, + h_k, + h_v, + s_q, + s_kv, + d_qk, + d_v, + scaling_factor, + return_softmaxstats, + is_causal, + dropout_probability, + cum_seqlen_q, + cum_seqlen_kv, + q, + k, + v, + attn_bias, + softmaxstats, + o, + dropoutseed, + dropoutoffset, + handle); + } auto seqlen_q = at::diff(cum_seqlen_q, 1, 0); auto seqlen_kv = at::diff(cum_seqlen_kv, 1, 0); auto rag_q_off = cum_seqlen_q.mul(h_q * d_qk); @@ -1636,7 +1668,8 @@ void run_cudnn_SDP_bprop( attn_bias, dropout_probability, is_causal, - true); + true, + false); auto graph_backward_ptr = getMHAGraphBackwardCache_().find(key); std::shared_ptr mha_graph; if (graph_backward_ptr) { @@ -1761,33 +1794,55 @@ void run_cudnn_SDP_bprop_nestedtensor( cudnnHandle_t handle = getCudnnHandle(); - auto mha_graph = build_graph_backward_nestedtensor( + auto key = MHACacheKeyWrapper( b, h_q, - h_k, - h_v, - s_q, - s_kv, + s_q, // max-seqlen-q + s_kv, // max-seqlen-kv d_qk, d_v, - scaling_factor, - is_causal, - dropout_probability, - cum_seqlen_q, - cum_seqlen_kv, q, k, v, attn_bias, - o, - dO_, - softmaxstats, - dQ, - dK, - dV, - dropoutseed, - dropoutoffset, - handle); + dropout_probability, + is_causal, + true, + true); + auto graph_ptr = getMHAGraphCache_().find(key); + std::shared_ptr 
mha_graph; + + if (graph_ptr) { + mha_graph = *graph_ptr; + } else { + mha_graph = build_graph_backward_nestedtensor( + b, + h_q, + h_k, + h_v, + s_q, + s_kv, + d_qk, + d_v, + scaling_factor, + is_causal, + dropout_probability, + cum_seqlen_q, + cum_seqlen_kv, + q, + k, + v, + attn_bias, + o, + dO_, + softmaxstats, + dQ, + dK, + dV, + dropoutseed, + dropoutoffset, + handle); + } std::unordered_map variant_pack = { // inputs diff --git a/aten/src/ATen/native/miopen/Conv_miopen.cpp b/aten/src/ATen/native/miopen/Conv_miopen.cpp index 154118d9f272..41226680c4b5 100644 --- a/aten/src/ATen/native/miopen/Conv_miopen.cpp +++ b/aten/src/ATen/native/miopen/Conv_miopen.cpp @@ -18,6 +18,7 @@ #include #include #include +#include #endif // TODO: Remove the condition on AT_ROCM_ENABLED entirely, @@ -145,13 +146,13 @@ at::Tensor miopen_convolution_relu( #include #include +#include #include #include #include #include -#include #include #include #include @@ -162,10 +163,13 @@ at::Tensor miopen_convolution_relu( namespace at { namespace native { -Tensor narrowGroup(const Tensor& t, int dim, int group_idx, int64_t groups) { - auto group_size = t.size(dim) / groups; - return t.narrow(dim, group_idx * group_size, group_size); -} +// See NOTE [ Convolution design ] in aten/src/ATen/native/cudnn/ConvShared.cpp + +// --------------------------------------------------------------------- +// +// Helper classes +// +// --------------------------------------------------------------------- // This POD struct is used to let us easily compute hashes of the // parameters @@ -174,6 +178,8 @@ struct ConvolutionParams miopenHandle_t handle; miopenDataType_t dataType; int input_size[2 + max_dim]; + uint8_t input_dim; + at::MemoryFormat memory_format; int input_stride[2 + max_dim]; int weight_size[2 + max_dim]; int padding[max_dim]; @@ -181,25 +187,29 @@ struct ConvolutionParams int dilation[max_dim]; int64_t groups; bool deterministic; - int device_id; //This is needed to distinguish between miopen handles of multiple gpus. + c10::DeviceIndex device_id; //This is needed to distinguish between miopen handles of multiple gpus. 
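// [Editorial sketch] ConvolutionParams is kept standard-layout on purpose: the
// benchmark cache hashes and compares it as raw bytes (see the ParamsHash /
// ParamsEqual functors removed below, replaced here by the shared ATen helpers),
// which is also why setConvolutionParams memsets the struct to zero first so that
// padding bytes are deterministic. A minimal illustration of that byte-wise hash
// over a trivially copyable key (hash constants match the removed functor; the
// names Key and hash_key_bytes are ours):
#include <cstddef>
#include <cstdint>
#include <type_traits>
template <typename Key>
std::size_t hash_key_bytes(const Key& key) {
  static_assert(std::is_trivially_copyable_v<Key>, "key must be hashable as raw bytes");
  const auto* p = reinterpret_cast<const uint8_t*>(&key);
  uint32_t value = 0x811C9DC5;   // FNV offset basis
  for (std::size_t i = 0; i < sizeof(Key); ++i) {
    value ^= p[i];
    value *= 0x01000193;         // FNV prime
  }
  return static_cast<std::size_t>(value);
}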
// NB: transposed purposely omitted: transposed just swaps // forward and backward, so you can reuse the benchmark entry, }; -// ConvolutionParams must be a POD because we read out its memory -// contenst as char* when hashing -static_assert(std::is_standard_layout_v, "ConvolutionParams not POD"); void setConvolutionParams( - ConvolutionParams* params, miopenHandle_t handle, - const at::Tensor& input, const at::Tensor& weight, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, - int64_t groups, bool deterministic) { - + ConvolutionParams* params, + miopenHandle_t handle, + const at::Tensor& input, + const at::Tensor& weight, + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool deterministic, + at::MemoryFormat memory_format) { miopenDataType_t dataType = getMiopenDataType(input); memset(params, 0, sizeof(ConvolutionParams)); params->dataType = dataType; params->handle = handle; // ASSERT(weight.dim() == input.dim()) + params->input_dim = input.dim(); + params->memory_format = memory_format; for (int i = 0; i != input.dim(); ++i) { params->input_size[i] = (int) input.size(i); params->input_stride[i] = (int) input.stride(i); @@ -214,9 +224,7 @@ void setConvolutionParams( } params->groups = groups; params->deterministic = deterministic; - int device_id; - HIP_CHECK(hipGetDevice(&device_id)); - params->device_id = device_id; + params->device_id = at::cuda::current_device(); } // Convenience struct for passing around descriptors and data @@ -239,31 +247,10 @@ struct ConvolutionArgs { // // --------------------------------------------------------------------- -// Hashing machinery for ConvolutionParams -struct ParamsHash { - std::size_t operator()(const ConvolutionParams& params) const { - auto ptr = reinterpret_cast(¶ms); - uint32_t value = 0x811C9DC5; - for (const auto i : c10::irange((int)sizeof(ConvolutionParams))) { - value ^= ptr[i]; - value *= 0x01000193; - } - return (size_t)value; - } -}; - -struct ParamsEqual { - bool operator()(const ConvolutionParams& a, const ConvolutionParams& b) const { - auto ptr1 = reinterpret_cast(&a); - auto ptr2 = reinterpret_cast(&b); - return memcmp(ptr1, ptr2, sizeof(ConvolutionParams)) == 0; - } -}; - template struct BenchmarkCache { std::mutex mutex; - std::unordered_map map; + std::unordered_map, ParamsEqual> map; bool find(const ConvolutionParams& params, T* results) { std::lock_guard guard(mutex); @@ -314,39 +301,39 @@ size_t getWorkspaceSize( const ConvolutionArgs& args, const miopenConvFwdAlgorithm_t) { size_t sz = 0; - miopenConvolutionForwardGetWorkSpaceSize( + MIOPEN_CHECK(miopenConvolutionForwardGetWorkSpaceSize( args.handle, args.wdesc.desc(), args.idesc.desc(), args.cdesc.desc(), args.odesc.desc(), - &sz); + &sz)); return sz; } size_t getWorkspaceSize( const ConvolutionArgs& args, const miopenConvBwdDataAlgorithm_t) { size_t sz = 0; - miopenConvolutionBackwardDataGetWorkSpaceSize( + MIOPEN_CHECK(miopenConvolutionBackwardDataGetWorkSpaceSize( args.handle, args.odesc.desc(), args.wdesc.desc(), args.cdesc.desc(), args.idesc.desc(), - &sz); + &sz)); return sz; } size_t getWorkspaceSize( const ConvolutionArgs& args, const miopenConvBwdWeightsAlgorithm_t) { size_t sz = 0; - miopenConvolutionBackwardWeightsGetWorkSpaceSize( + MIOPEN_CHECK(miopenConvolutionBackwardWeightsGetWorkSpaceSize( args.handle, args.odesc.desc(), args.idesc.desc(), args.cdesc.desc(), args.wdesc.desc(), - &sz); + &sz)); return sz; } @@ -649,6 +636,94 @@ Workspace chooseSolution(const ConvolutionArgs& args, uint64_t* 
solution_id) } } +// See NOTE [ raw_cudnn_convolution_forward_out ] in aten/src/ATen/native/cudnn/Conv_v7.cpp + +// --------------------------------------------------------------------- +// +// Splitting to 32bit +// +// --------------------------------------------------------------------- + +template +static inline void split_batch_dim_to_32bit_out( + const at::Tensor& output, + const at::Tensor& input, + const at::Tensor& weight, + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic, + bool depthwise, + int64_t max_worksize, + func_t func_32bit) { + constexpr int64_t int_max = std::numeric_limits::max(); + const int64_t ni = input.numel(); + const int64_t no = output.numel(); + // Assume the shape of the tensor is (N, C, D1, D2, ...) + // if N * C * D1 * D2 * ... <= int_max, then no need to split at all + if (ni <= int_max && no <= int_max) { + func_32bit( + output, + input, + weight, + padding, + stride, + dilation, + groups, + benchmark, + deterministic, + depthwise); + return; + } + // else, if C * D1 * D2 * ... <= int_max, then we just need to split across + // the N dimension + // + // Here we use a simple heuristics to determine the size of each split + // We don't max out the 2^31 address space because this number is super + // large and very likely to get an OOM. + int64_t n = output.size(0); + int64_t max_inner_size = std::max(ni, no) / n; + int64_t split_size = std::max(max_worksize / max_inner_size, 1L); + int64_t num_splits = (n + split_size - 1) / split_size; + if (split_size * max_inner_size < int_max) { + for (const auto i : c10::irange(num_splits)) { + int64_t start = split_size * i; + int64_t split_size_ = std::min(split_size, n - start); + Tensor input_ = input.narrow(0, start, split_size_); + Tensor output_ = output.narrow(0, start, split_size_); + func_32bit( + output_, + input_, + weight, + padding, + stride, + dilation, + groups, + benchmark, + deterministic, + depthwise); + } + return; + } + // If control flow reaches here, this means even splitting N is not enough, + // then things starts to become complicated: For example, for conv2d, there + // following questions needs to be considered. + // - Is the memory layout NCHW or NHWC ? + // - If the conv is NCHW -> NC'H'W', then should we + // - split only NC? + // - split only N'C'? + // - split both? + // - If the conv is NHWC, then we need to split across H, we need to be very + // careful about the boundary condition + // to make sure that the boundary is handled correctly. + // - If we decide to make these splits, is the memory contiguous? Do we need + // to copy the memory? 
Considering the complexity of this issue, it is better + // not to use cuDNN for this case + TORCH_INTERNAL_ASSERT(false, "This case should not be dispatched to cuDNN."); +} + // --------------------------------------------------------------------- // // Bias addition @@ -690,8 +765,47 @@ void miopen_convolution_add_bias_(CheckedFrom c, const TensorArg& output, const */ } -// see NOTE [ Convolution design ] in src/Aten/native/cudnn/Conv.cpp +Tensor miopen_convolution_backward_bias(const Tensor& grad_output_t) +{ + TensorArg grad_output{ grad_output_t, "grad_output", 1 }; + + // TODO: Workaround since MIOpen does not support NHWC bias + // See #64426 + std::vector discard_dims; + for( int i = 0; i < grad_output_t.dim(); i++ ) { + if(i != output_channels_dim ) { + discard_dims.push_back(i); + } + } + + Tensor outputBias = at::squeeze( at::sum(grad_output_t, discard_dims, true) ); + if( outputBias.dim() == 0 ) { + // always return a tensor of shape [_] + return outputBias.unsqueeze(0); + } + else { + return outputBias; + } + +/* MIOpen does not support NHWC bias. Activate once support is added. + auto grad_bias_t = at::empty( { grad_output->size(output_channels_dim) }, grad_output->options()); + + TensorArg grad_bias{ grad_bias_t, "result", 0 }; + + TensorDescriptor bdesc{grad_bias->expand({1, grad_bias->size(0)}), + static_cast(grad_output->dim())}; + TensorDescriptor odesc{*grad_output}; + + auto handle = getMiopenHandle(); + auto dataType = getMiopenDataType(*grad_bias); + Constant one(dataType, 1); + Constant zero(dataType, 0); + MIOPEN_CHECK(miopenConvolutionBackwardBias(handle, &one, odesc.desc(), grad_output->data_ptr(), + &zero, bdesc.desc(), grad_bias->data_ptr())); + return *grad_bias; +*/ +} // --------------------------------------------------------------------- // @@ -699,30 +813,47 @@ void miopen_convolution_add_bias_(CheckedFrom c, const TensorArg& output, const // // --------------------------------------------------------------------- -// The raw API directly invokes MIOpen. -// -// There are a few reasons this should never be directly exposed -// via ATen: -// -// - It takes output as a parameter (this should be computed!) -// - It doesn't do input checking -// - It doesn't resize output (it is assumed to be correctly sized) -// -void raw_miopen_convolution_forward_out( - const Tensor& output, const Tensor& input, const Tensor& weight, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic) { - +void raw_miopen_convolution_forward_out_32bit( + const Tensor& output, + const Tensor& input, + const Tensor& weight, + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic, + bool depthwise=false) { auto dataType = getMiopenDataType(input); - miopenConvolutionMode_t c_mode = miopenConvolution; + miopenConvolutionMode_t c_mode = depthwise ? 
miopenDepthwise : miopenConvolution; - ConvolutionArgs args{ input, output, weight }; + ConvolutionArgs args{input, output, weight}; args.handle = getMiopenHandle(); - setConvolutionParams(&args.params, args.handle, input, weight, padding, stride, dilation, groups, deterministic); - args.idesc.set(input); - args.wdesc.set(weight, input.suggest_memory_format(), 0); - args.odesc.set(output); - args.cdesc.set(dataType, c_mode, input.dim() - 2, args.params.padding, args.params.stride, args.params.dilation, args.params.groups, benchmark, deterministic); + at::MemoryFormat memory_format = miopen_conv_suggest_memory_format(input, weight); + setConvolutionParams( + &args.params, + args.handle, + input, + weight, + padding, + stride, + dilation, + groups, + deterministic, + memory_format); + args.idesc.set(input, memory_format); + args.wdesc.set(weight, memory_format, 0); + args.odesc.set(output, memory_format); + args.cdesc.set( + dataType, + c_mode, + input.dim() - 2, + args.params.padding, + args.params.stride, + args.params.dilation, + args.params.groups, + benchmark, + deterministic); if (at::globalContext().immediateMiopen()) { uint64_t solution_id; @@ -730,10 +861,16 @@ void raw_miopen_convolution_forward_out( MIOPEN_CHECK(miopenConvolutionForwardImmediate( args.handle, - args.wdesc.desc(), weight.const_data_ptr(), - args.idesc.desc(), input.const_data_ptr(), + args.wdesc.desc(), + weight.const_data_ptr(), + args.idesc.desc(), + input.const_data_ptr(), args.cdesc.desc(), - args.odesc.desc(), output.data_ptr(), workspace.data, workspace.size, solution_id)); + args.odesc.desc(), + output.data_ptr(), + workspace.data, + workspace.size, + solution_id)); } else { miopenConvFwdAlgorithm_t fwdAlg; @@ -744,472 +881,216 @@ void raw_miopen_convolution_forward_out( MIOPEN_CHECK(miopenConvolutionForward( args.handle, - &one, args.idesc.desc(), input.const_data_ptr(), - args.wdesc.desc(), weight.const_data_ptr(), - args.cdesc.desc(), fwdAlg, &zero, - args.odesc.desc(), output.data_ptr(), workspace.data, workspace.size)); + &one, + args.idesc.desc(), + input.const_data_ptr(), + args.wdesc.desc(), + weight.const_data_ptr(), + args.cdesc.desc(), + fwdAlg, + &zero, + args.odesc.desc(), + output.data_ptr(), + workspace.data, + workspace.size)); } } -Tensor miopen_convolution_forward( +void raw_miopen_convolution_forward_out( + const Tensor& output, + const Tensor& input, + const Tensor& weight, + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic, + bool depthwise=false) { + split_batch_dim_to_32bit_out( + output, + input, + weight, + padding, + stride, + dilation, + groups, + benchmark, + deterministic, + depthwise, + 1024 * 1024 * 256, + raw_miopen_convolution_forward_out_32bit); +} + +void miopen_convolution_forward_out( + TensorArg& output, CheckedFrom c, - const TensorArg& input, const TensorArg& weight, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic) -{ + const TensorArg& input, + const TensorArg& weight, + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic, + bool depthwise=false) { checkAllSameType(c, {input, weight}); checkAllSameGPU(c, {input, weight}); - auto memory_format = at::MemoryFormat::Contiguous; - if (miopen_conv_use_channels_last(*input, *weight)) { - memory_format = (weight->ndimension() == 5) ? 
at::MemoryFormat::ChannelsLast3d : at::MemoryFormat::ChannelsLast; - } - - Tensor output_t = at::detail::empty_cuda( - conv_output_size(input->sizes(), weight->sizes(), - padding, stride, dilation), - input->options().memory_format(memory_format)); - - if (output_t.numel() == 0) { - return output_t; - } - - // Avoid ambiguity of "output" when this is being used as backwards - TensorArg output{ output_t, "result", 0 }; - convolution_shape_check(c, input, weight, output, padding, stride, dilation, groups); + auto memory_format = output->suggest_memory_format(); + convolution_shape_check( + c, input, weight, output, padding, stride, dilation, groups); - // See #4500 Tensor weight_contig = weight->contiguous(memory_format); - // Make sure that NC11 strides follow formula - weight_contig.resize_(weight_contig.sizes(), memory_format); Tensor input_contig = input->contiguous(memory_format); - input_contig.resize_(input_contig.sizes(), memory_format); - - raw_miopen_convolution_forward_out( - *output, input_contig, weight_contig, - padding, stride, dilation, groups, benchmark, deterministic); - - return *output; + *output, + input_contig, + weight_contig, + padding, + stride, + dilation, + groups, + benchmark, + deterministic, + depthwise); } Tensor miopen_convolution( - const Tensor& input_t, const Tensor& weight_t, const std::optional& bias_t_opt, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, - int64_t groups, bool benchmark, bool deterministic) -{ + const Tensor& input_t, + const Tensor& weight_t, + const std::optional& bias_t_opt, + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned bias_t_maybe_owned = at::borrow_from_optional_tensor(bias_t_opt); const Tensor& bias_t = *bias_t_maybe_owned; - TensorArg input { input_t, "input", 1 }, - weight { weight_t, "weight", 2 }, - bias { bias_t, "bias", 3 }; + TensorArg input{input_t, "input", 1 }, weight{weight_t, "weight", 2}, bias{bias_t, "bias", 3}; CheckedFrom c = "miopen_convolution"; - auto output_t = miopen_convolution_forward( - c, input, weight, padding, stride, dilation, groups, benchmark, deterministic); + auto memory_format = miopen_conv_suggest_memory_format(input_t, weight_t); + Tensor output_t = at::detail::empty_cuda( + conv_output_size( + input_t.sizes(), weight_t.sizes(), padding, stride, dilation), + input->options().memory_format(memory_format)); + if (output_t.numel() == 0) { + return output_t; + } + // Avoid ambiguity of "output" when this is being used as backwards + TensorArg output{output_t, "result", 0}; + miopen_convolution_forward_out( + output, + c, + input, + weight, + padding, + stride, + dilation, + groups, + benchmark, + deterministic); if (bias->defined()) { - miopen_convolution_add_bias_(c, { output_t, "result", 0 }, bias); + miopen_convolution_add_bias_(c, output, bias); } - return output_t; + return *output; } -//Depthwise Convolutions -void raw_miopen_depthwise_convolution_forward_out( - const Tensor& output, const Tensor& input, const Tensor& weight, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic) { +Tensor miopen_convolution_transpose_backward_input( + const Tensor& grad_output_t, + const Tensor& weight_t, + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic) { + TensorArg grad_output{ grad_output_t, 
"grad_output", 1 }, weight{weight_t, "weight", 2}; + auto memory_format = + miopen_conv_suggest_memory_format(grad_output_t, weight_t); + Tensor output_t = at::detail::empty_cuda( + conv_output_size( + grad_output_t.sizes(), weight_t.sizes(), padding, stride, dilation), + grad_output_t.options().memory_format(memory_format)); - auto dataType = getMiopenDataType(input); - miopenConvolutionMode_t c_mode = miopenDepthwise; + if (output_t.numel() == 0) { + return output_t; + } + TensorArg output{output_t, "result", 0}; + miopen_convolution_forward_out( + output, + "miopen_convolution_transpose_backward_input", + grad_output, + weight, + padding, + stride, + dilation, + groups, + benchmark, + deterministic); + return *output; +} - ConvolutionArgs args{ input, output, weight }; - args.handle = getMiopenHandle(); - setConvolutionParams(&args.params, args.handle, input, weight, padding, stride, dilation, groups, deterministic); - args.idesc.set(input); - args.wdesc.set(weight, input.suggest_memory_format(), 0); - args.odesc.set(output); - args.cdesc.set(dataType, c_mode, input.dim() - 2, args.params.padding, args.params.stride, args.params.dilation, args.params.groups, benchmark, deterministic); +// file organization would put miopen_convolution_transpose_backward_weight here, +// but it depends on miopen_convolution_backward_weight which is defined later +Tensor miopen_convolution_transpose_backward_weight( + IntArrayRef weight_size, + const Tensor& grad_output_t, + const Tensor& input_t, + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic); - if (at::globalContext().immediateMiopen()) { - uint64_t solution_id; - Workspace workspace = chooseSolution(args, &solution_id); +std::tuple miopen_convolution_transpose_backward( + const at::Tensor& input, + const at::Tensor& grad_output_t, + const at::Tensor& weight, + IntArrayRef padding, + IntArrayRef output_padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic, + std::array output_mask) { + Tensor grad_output = grad_output_t.contiguous(input.suggest_memory_format()); - MIOPEN_CHECK(miopenConvolutionForwardImmediate( - args.handle, - args.wdesc.desc(), weight.const_data_ptr(), - args.idesc.desc(), input.const_data_ptr(), - args.cdesc.desc(), - args.odesc.desc(), output.data_ptr(), workspace.data, workspace.size, solution_id)); + Tensor grad_input, grad_weight, grad_bias; + if (output_mask[0]) { + grad_input = miopen_convolution_transpose_backward_input( + grad_output, + weight, + padding, + stride, + dilation, + groups, + benchmark, + deterministic); } - else { - miopenConvFwdAlgorithm_t fwdAlg; - Workspace workspace = chooseAlgorithm(args, benchmark, &fwdAlg); - - Constant one(dataType, 1); - Constant zero(dataType, 0); - - MIOPEN_CHECK(miopenConvolutionForward( - args.handle, - &one, args.idesc.desc(), input.const_data_ptr(), - args.wdesc.desc(), weight.const_data_ptr(), - args.cdesc.desc(), fwdAlg, &zero, - args.odesc.desc(), output.data_ptr(), workspace.data, workspace.size)); + if (output_mask[1]) { + grad_weight = miopen_convolution_transpose_backward_weight( + weight.sizes(), + grad_output, + input, + padding, + stride, + dilation, + groups, + benchmark, + deterministic); + } + if (output_mask[2]) { + grad_bias = miopen_convolution_backward_bias(grad_output); } -} - -Tensor miopen_depthwise_convolution_forward( - CheckedFrom c, - const TensorArg& input, const TensorArg& weight, - IntArrayRef padding, IntArrayRef 
stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic) -{ - checkAllSameType(c, {input, weight}); - checkAllSameGPU(c, {input, weight}); - - auto memory_format = at::MemoryFormat::Contiguous; - if (miopen_conv_use_channels_last(*input, *weight)) { - memory_format = (weight->ndimension() == 5) ? at::MemoryFormat::ChannelsLast3d : at::MemoryFormat::ChannelsLast; - } - - Tensor output_t = at::detail::empty_cuda( - conv_output_size(input->sizes(), weight->sizes(), - padding, stride, dilation), - input->options().memory_format(memory_format)); - - TensorArg output{ output_t, "result", 0 }; - convolution_shape_check(c, input, weight, output, padding, stride, dilation, groups); - - // See #4500 - Tensor weight_contig = weight->contiguous(memory_format); - // Make sure that NC11 strides follow formula - weight_contig.resize_(weight_contig.sizes(), memory_format); - Tensor input_contig = input->contiguous(memory_format); - input_contig.resize_(input_contig.sizes(), memory_format); - - raw_miopen_depthwise_convolution_forward_out( - *output, input_contig, weight_contig, - padding, stride, dilation, groups, benchmark, deterministic); - - return *output; -} - -Tensor miopen_depthwise_convolution( - const Tensor& input_t, const Tensor& weight_t, const std::optional& bias_t_opt, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, - int64_t groups, bool benchmark, bool deterministic) -{ - // See [Note: hacky wrapper removal for optional tensor] - c10::MaybeOwned bias_t_maybe_owned = at::borrow_from_optional_tensor(bias_t_opt); - const Tensor& bias_t = *bias_t_maybe_owned; - - TensorArg input { input_t, "input", 1 }, - weight { weight_t, "weight", 2 }, - bias { bias_t, "bias", 3 }; - CheckedFrom c = "miopen_depthwise_convolution"; - auto output_t = miopen_depthwise_convolution_forward( - c, input, weight, padding, stride, dilation, groups, benchmark, deterministic); - if (bias->defined()) { - miopen_convolution_add_bias_(c, { output_t, "result", 0 }, bias); - } - return output_t; -} - -// --------------------------------------------------------------------- -// -// Convolution backward (bias) -// -// --------------------------------------------------------------------- - -Tensor miopen_convolution_backward_bias( - const Tensor& grad_output_t) -{ - TensorArg grad_output{ grad_output_t, "grad_output", 1 }; - - // TODO: Workaround since MIOpen does not support NHWC bias - // See #64426 - std::vector discard_dims; - for( int i = 0; i < grad_output_t.dim(); i++ ) { - if(i != output_channels_dim ) { - discard_dims.push_back(i); - } - } - - Tensor outputBias = at::squeeze( at::sum(grad_output_t, discard_dims, true) ); - if( outputBias.dim() == 0 ) { - // always return a tensor of shape [_] - return outputBias.unsqueeze(0); - } - else { - return outputBias; - } - -/* MIOpen does not support NHWC bias. Activate once support is added. 
- auto grad_bias_t = at::empty( { grad_output->size(output_channels_dim) }, grad_output->options()); - - TensorArg grad_bias{ grad_bias_t, "result", 0 }; - - TensorDescriptor bdesc{grad_bias->expand({1, grad_bias->size(0)}), - static_cast(grad_output->dim())}; - TensorDescriptor odesc{*grad_output}; - - auto handle = getMiopenHandle(); - auto dataType = getMiopenDataType(*grad_bias); - Constant one(dataType, 1); - Constant zero(dataType, 0); - - MIOPEN_CHECK(miopenConvolutionBackwardBias(handle, &one, odesc.desc(), grad_output->data_ptr(), - &zero, bdesc.desc(), grad_bias->data_ptr())); - return *grad_bias; -*/ -} - -// --------------------------------------------------------------------- -// -// Convolution backward (weight) -// -// --------------------------------------------------------------------- - -void raw_miopen_convolution_backward_weight_out( - const Tensor& grad_weight, const Tensor& grad_output, const Tensor& input, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic) { - - auto dataType = getMiopenDataType(input); - miopenConvolutionMode_t c_mode = miopenConvolution; - - ConvolutionArgs args{ input, grad_output, grad_weight }; - args.handle = getMiopenHandle(); - setConvolutionParams(&args.params, args.handle, input, grad_weight, padding, stride, dilation, groups, deterministic); - args.idesc.set(input); - args.wdesc.set(grad_weight, input.suggest_memory_format(), 0); - args.odesc.set(grad_output); - args.cdesc.set(dataType, c_mode, input.dim() - 2, args.params.padding, args.params.stride, args.params.dilation, args.params.groups, benchmark, deterministic); - - if (at::globalContext().immediateMiopen()) { - uint64_t solution_id; - Workspace workspace = chooseSolution(args, &solution_id); - - MIOPEN_CHECK(miopenConvolutionBackwardWeightsImmediate( - args.handle, - args.odesc.desc(), grad_output.const_data_ptr(), - args.idesc.desc(), input.const_data_ptr(), - args.cdesc.desc(), - args.wdesc.desc(), grad_weight.data_ptr(), workspace.data, workspace.size, solution_id)); - } - else { - miopenConvBwdWeightsAlgorithm_t bwdFilterAlg; - Workspace workspace = chooseAlgorithm(args, benchmark, &bwdFilterAlg); - - Constant one(dataType, 1); - Constant zero(dataType, 0); - - MIOPEN_CHECK(miopenConvolutionBackwardWeights( - args.handle, - &one, args.odesc.desc(), grad_output.const_data_ptr(), - args.idesc.desc(), input.const_data_ptr(), - args.cdesc.desc(), bwdFilterAlg, &zero, - args.wdesc.desc(), grad_weight.data_ptr(), workspace.data, workspace.size)); - } -} - -//Depthwise backward weights. 
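// [Editorial note] The depthwise twins removed below duplicate the regular backward
// paths except for the MIOpen convolution mode; the replacement code threads a
// single `depthwise` flag down to the mode choice instead. Minimal sketch of that
// pattern (the wrapper name select_conv_mode is ours; it relies on the MIOpen types
// already included by this file):
inline miopenConvolutionMode_t select_conv_mode(bool depthwise) {
  return depthwise ? miopenDepthwise : miopenConvolution;
}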
-void raw_miopen_depthwise_convolution_backward_weight_out( - const Tensor& grad_weight, const Tensor& grad_output, const Tensor& input, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic) { - - auto dataType = getMiopenDataType(input); - miopenConvolutionMode_t c_mode = miopenDepthwise; - - ConvolutionArgs args{ input, grad_output, grad_weight }; - args.handle = getMiopenHandle(); - setConvolutionParams(&args.params, args.handle, input, grad_weight, padding, stride, dilation, groups, deterministic); - args.idesc.set(input); - args.wdesc.set(grad_weight, input.suggest_memory_format(), 0); - args.odesc.set(grad_output); - args.cdesc.set(dataType, c_mode, input.dim() - 2, args.params.padding, args.params.stride, args.params.dilation, args.params.groups, benchmark, deterministic); - - if (at::globalContext().immediateMiopen()) { - uint64_t solution_id; - Workspace workspace = chooseSolution(args, &solution_id); - - MIOPEN_CHECK(miopenConvolutionBackwardWeightsImmediate( - args.handle, - args.odesc.desc(), grad_output.const_data_ptr(), - args.idesc.desc(), input.const_data_ptr(), - args.cdesc.desc(), - args.wdesc.desc(), grad_weight.data_ptr(), workspace.data, workspace.size, solution_id)); - } - else { - miopenConvBwdWeightsAlgorithm_t bwdFilterAlg; - Workspace workspace = chooseAlgorithm(args, benchmark, &bwdFilterAlg); - - Constant one(dataType, 1); - Constant zero(dataType, 0); - - MIOPEN_CHECK(miopenConvolutionBackwardWeights( - args.handle, - &one, args.odesc.desc(), grad_output.const_data_ptr(), - args.idesc.desc(), input.const_data_ptr(), - args.cdesc.desc(), bwdFilterAlg, &zero, - args.wdesc.desc(), grad_weight.data_ptr(), workspace.data, workspace.size)); - } -} - -Tensor miopen_depthwise_convolution_backward_weight( - CheckedFrom c, - IntArrayRef weight_size, const TensorArg& grad_output, const TensorArg& input, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic) -{ - - checkAllSameType(c, {grad_output, input}); - checkAllSameGPU(c, {grad_output, input}); - - auto memory_format = at::MemoryFormat::Contiguous; - if (miopen_conv_use_channels_last(*input, *grad_output)) { - memory_format = (input->ndimension() == 5) ? at::MemoryFormat::ChannelsLast3d : at::MemoryFormat::ChannelsLast; - } - - Tensor grad_output_contig_t = grad_output->contiguous(memory_format); - // Make sure that NC11 strides follow formula - grad_output_contig_t.resize_(grad_output_contig_t.sizes(), memory_format); - TensorArg grad_output_contig{ grad_output_contig_t, "grad_output", 1 }; - - Tensor input_contig_t = input->contiguous(memory_format); - input_contig_t.resize_(input_contig_t.sizes(), memory_format); - TensorArg input_contig{ input_contig_t, "input", 2}; - - auto grad_weight_t = at::empty(weight_size, grad_output_contig->options(), memory_format); - - // For uniformity with everything else, although it seems grad_weight - // would be unambiguous too. 
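// [Editorial note] The open-coded memory-format selection in these removed bodies
// (Contiguous unless channels-last is profitable, ChannelsLast3d for 5-d tensors)
// is what the new miopen_conv_suggest_memory_format(...) calls centralize; its
// exact implementation is not part of this hunk. A sketch of the same decision,
// with the predicate argument prefers_channels_last standing in for the real check:
#include <cstdint>
#include <c10/core/MemoryFormat.h>
static c10::MemoryFormat suggest_conv_memory_format_sketch(bool prefers_channels_last,
                                                           int64_t weight_ndim) {
  if (!prefers_channels_last) {
    return c10::MemoryFormat::Contiguous;
  }
  return weight_ndim == 5 ? c10::MemoryFormat::ChannelsLast3d
                          : c10::MemoryFormat::ChannelsLast;
}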
- TensorArg grad_weight{ grad_weight_t, "result", 0 }; - convolution_shape_check(c, input, grad_weight, grad_output_contig, padding, stride, dilation, groups); - - raw_miopen_depthwise_convolution_backward_weight_out( - *grad_weight, *grad_output_contig, *input_contig, - padding, stride, dilation, groups, benchmark, deterministic); - - return grad_weight_t; -} - -Tensor miopen_depthwise_convolution_backward_weight( - IntArrayRef weight_size, - const Tensor& grad_output_t, - const Tensor& input_t, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic) -{ - TensorArg grad_output{ grad_output_t, "grad_output", 1 }, - input{ input_t, "input", 2 }; - return miopen_depthwise_convolution_backward_weight( - "miopen_depthwise_convolution_backward_weight", - weight_size, grad_output, input, - padding, stride, dilation, groups, benchmark, deterministic); -} - -Tensor miopen_convolution_backward_weight( - CheckedFrom c, - IntArrayRef weight_size, const TensorArg& grad_output, const TensorArg& input, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic) -{ - - checkAllSameType(c, {grad_output, input}); - checkAllSameGPU(c, {grad_output, input}); - - auto memory_format = at::MemoryFormat::Contiguous; - if (miopen_conv_use_channels_last(*input, *grad_output)) { - memory_format = (input->ndimension() == 5) ? at::MemoryFormat::ChannelsLast3d : at::MemoryFormat::ChannelsLast; - } - - Tensor grad_output_contig_t = grad_output->contiguous(memory_format); - // Make sure that NC11 strides follow formula - grad_output_contig_t.resize_(grad_output_contig_t.sizes(), memory_format); - TensorArg grad_output_contig{ grad_output_contig_t, "grad_output", 1 }; - - Tensor input_contig_t = input->contiguous(memory_format); - input_contig_t.resize_(input_contig_t.sizes(), memory_format); - TensorArg input_contig{ input_contig_t, "input", 2}; - - auto grad_weight_t = at::empty(weight_size, grad_output_contig->options(), memory_format); - - // For uniformity with everything else, although it seems grad_weight - // would be unambiguous too. 
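// [Editorial note] The contiguous(memory_format) + resize_(sizes, memory_format)
// pair in these removed bodies canonicalizes the strides of size-1 dims, so an
// NC11 weight really carries channels-last strides. Worked example (our numbers):
//   shape (Cout, Cin, 1, 1) = (64, 32, 1, 1) in ChannelsLast
//   canonical strides: (H*W*Cin, 1, W*Cin, Cin) = (32, 1, 32, 32)
// contiguous() alone can leave the degenerate H/W strides in a different (still
// valid) form, so the explicit resize_ keeps downstream stride-based format
// detection seeing the expected values.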
- TensorArg grad_weight{ grad_weight_t, "result", 0 }; - convolution_shape_check(c, input, grad_weight, grad_output_contig, padding, stride, dilation, groups); - - raw_miopen_convolution_backward_weight_out( - *grad_weight, *grad_output_contig, *input_contig, - padding, stride, dilation, groups, benchmark, deterministic); - - return grad_weight_t; -} - -Tensor miopen_convolution_backward_weight( - IntArrayRef weight_size, - const Tensor& grad_output_t, - const Tensor& input_t, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic) -{ - TensorArg grad_output{ grad_output_t, "grad_output", 1 }, - input{ input_t, "input", 2 }; - return miopen_convolution_backward_weight( - "miopen_convolution_backward_weight", - weight_size, grad_output, input, - padding, stride, dilation, groups, benchmark, deterministic); -} - -Tensor miopen_convolution_transpose_backward_input( - const Tensor& grad_output_t, const Tensor& weight_t, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, - int64_t groups, bool benchmark, bool deterministic) -{ - TensorArg grad_output { grad_output_t, "grad_output", 1 }, - weight { weight_t, "weight", 2 }; - return miopen_convolution_forward( - "miopen_convolution_transpose_backward_input", - grad_output, weight, padding, stride, dilation, groups, benchmark, deterministic); -} - -Tensor miopen_convolution_transpose_backward_weight( - IntArrayRef weight_size, - const Tensor& grad_output_t, - const Tensor& input_t, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic) -{ - TensorArg grad_output{ grad_output_t, "grad_output", 1 }, - input{ input_t, "input", 2 }; - return miopen_convolution_backward_weight( - "miopen_convolution_backward_weight", - weight_size, input, grad_output, - padding, stride, dilation, groups, benchmark, deterministic); -} - -std::tuple miopen_convolution_transpose_backward( - const at::Tensor& input, const at::Tensor& grad_output_t, const at::Tensor& weight, - IntArrayRef padding, IntArrayRef output_padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic, std::array output_mask) { - - Tensor grad_output = grad_output_t.contiguous(input.suggest_memory_format()); - - Tensor grad_input, grad_weight, grad_bias; - if (output_mask[0]) { - grad_input = miopen_convolution_transpose_backward_input(grad_output, weight, padding, stride, dilation, groups, benchmark, deterministic); - } - if (output_mask[1]) { - grad_weight = miopen_convolution_transpose_backward_weight(weight.sizes(), grad_output, input, padding, stride, dilation, groups, benchmark, deterministic); - } - if (output_mask[2]) { - grad_bias = miopen_convolution_backward_bias(grad_output); - } - - return std::tuple{grad_input, grad_weight, grad_bias}; + + return std::tuple{grad_input, grad_weight, grad_bias}; } // --------------------------------------------------------------------- @@ -1218,23 +1099,50 @@ std::tuple miopen_convolution_transpose_backwa // // --------------------------------------------------------------------- -void raw_miopen_convolution_backward_input_out( +// See NOTE [ Backward vs transpose convolutions ] in aten/src/ATen/native/cudnn/ConvShared.cpp + +void raw_miopen_convolution_backward_input_out_32bit( const at::Tensor& grad_input, const at::Tensor& grad_output, const at::Tensor& weight, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool 
deterministic) { - + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic, + bool depthwise=false) { auto dataType = getMiopenDataType(grad_output); - miopenConvolutionMode_t c_mode = miopenConvolution; + miopenConvolutionMode_t c_mode = depthwise ? miopenDepthwise : miopenConvolution; - ConvolutionArgs args{ grad_input, grad_output, weight }; + ConvolutionArgs args{grad_input, grad_output, weight}; args.handle = getMiopenHandle(); - setConvolutionParams(&args.params, args.handle, grad_input, weight, padding, stride, dilation, groups, deterministic); - args.idesc.set(grad_input); - args.wdesc.set(weight, grad_output.suggest_memory_format(), 0); - args.odesc.set(grad_output); - args.cdesc.set(dataType, c_mode, grad_output.dim() - 2, args.params.padding, args.params.stride, args.params.dilation, args.params.groups, benchmark, deterministic); + at::MemoryFormat memory_format = + miopen_conv_suggest_memory_format(grad_input, weight); + setConvolutionParams( + &args.params, + args.handle, + grad_input, + weight, + padding, + stride, + dilation, + groups, + deterministic, + memory_format); + args.idesc.set(grad_input, memory_format); + args.wdesc.set(weight, memory_format, 0); + args.odesc.set(grad_output, memory_format); + args.cdesc.set( + dataType, + c_mode, + grad_output.dim() - 2, + args.params.padding, + args.params.stride, + args.params.dilation, + args.params.groups, + benchmark, + deterministic); if (at::globalContext().immediateMiopen()) { uint64_t solution_id; @@ -1245,7 +1153,10 @@ void raw_miopen_convolution_backward_input_out( args.odesc.desc(), grad_output.const_data_ptr(), args.wdesc.desc(), weight.const_data_ptr(), args.cdesc.desc(), - args.idesc.desc(), grad_input.mutable_data_ptr(), workspace.data, workspace.size, solution_id)); + args.idesc.desc(), grad_input.mutable_data_ptr(), + workspace.data, + workspace.size, + solution_id)); } else { miopenConvBwdDataAlgorithm_t bwdDataAlg; @@ -1256,216 +1167,521 @@ void raw_miopen_convolution_backward_input_out( MIOPEN_CHECK(miopenConvolutionBackwardData( args.handle, - &one, args.odesc.desc(), grad_output.const_data_ptr(), + &one, + args.odesc.desc(), grad_output.const_data_ptr(), args.wdesc.desc(), weight.const_data_ptr(), - args.cdesc.desc(), bwdDataAlg, &zero, - args.idesc.desc(), grad_input.mutable_data_ptr(), workspace.data, workspace.size)); + args.cdesc.desc(), + bwdDataAlg, + &zero, + args.idesc.desc(), grad_input.mutable_data_ptr(), + workspace.data, + workspace.size)); } } -// see NOTE [ Backward vs transpose convolutions ] in src/Aten/native/cudnn/Conv.cpp +void raw_miopen_convolution_backward_input_out( + const at::Tensor& grad_input, + const at::Tensor& grad_output, + const at::Tensor& weight, + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic, + bool depthwise=false) { + split_batch_dim_to_32bit_out( + grad_input, + grad_output, + weight, + padding, + stride, + dilation, + groups, + benchmark, + deterministic, + depthwise, + 1024 * 1024 * 128, + raw_miopen_convolution_backward_input_out_32bit); +} Tensor miopen_convolution_backward_input( CheckedFrom c, - IntArrayRef input_size, const TensorArg& grad_output, const TensorArg& weight, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic) -{ + IntArrayRef input_size, + const TensorArg& grad_output, + const TensorArg& weight, + IntArrayRef padding, + IntArrayRef 
stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic, + bool depthwise=false) { checkAllSameType(c, {grad_output, weight}); checkAllSameGPU(c, {grad_output, weight}); - auto memory_format = at::MemoryFormat::Contiguous; - if (miopen_conv_use_channels_last(*grad_output, *weight)) { - memory_format = (weight->ndimension() == 5) ? at::MemoryFormat::ChannelsLast3d : at::MemoryFormat::ChannelsLast; - } - + auto memory_format = miopen_conv_suggest_memory_format(*grad_output, *weight); Tensor grad_input_t = at::detail::empty_cuda( input_size, grad_output->options().memory_format(memory_format)); // Avoid "grad_input" when this is being used as transposed convolution - TensorArg grad_input{ grad_input_t, "result", 0 }; - convolution_shape_check(c, grad_input, weight, grad_output, padding, stride, dilation, groups); + TensorArg grad_input{grad_input_t, "result", 0}; + convolution_shape_check( + c, grad_input, weight, grad_output, padding, stride, dilation, groups); - // See #4500 Tensor weight_contig = weight->contiguous(memory_format); - // Make sure that NC11 strides follow formula - weight_contig.resize_(weight_contig.sizes(), memory_format); - Tensor grad_output_contig = grad_output->contiguous(memory_format); - grad_output_contig.resize_(grad_output_contig.sizes(), memory_format); raw_miopen_convolution_backward_input_out( - *grad_input, grad_output_contig, weight_contig, - padding, stride, dilation, groups, benchmark, deterministic); + *grad_input, + grad_output_contig, + weight_contig, + padding, + stride, + dilation, + groups, + benchmark, + deterministic, + depthwise); return *grad_input; } -Tensor miopen_convolution_transpose_forward( - CheckedFrom c, - const TensorArg& grad_output, const TensorArg& weight, - IntArrayRef padding, IntArrayRef output_padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic) -{ - auto input_size = conv_input_size(grad_output->sizes(), weight->sizes(), - padding, output_padding, stride, dilation, groups); - return miopen_convolution_backward_input(c, input_size, grad_output, weight, - padding, stride, dilation, groups, benchmark, deterministic); -} - +// overload Tensor miopen_convolution_backward_input( - IntArrayRef input_size, const Tensor& grad_output_t, const Tensor& weight_t, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic) -{ - TensorArg grad_output{ grad_output_t, "grad_output", 1 }, - weight{ weight_t, "weight", 2 }; + IntArrayRef input_size, + const Tensor& grad_output_t, + const Tensor& weight_t, + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic, + bool depthwise=false) { + TensorArg grad_output{grad_output_t, "grad_output", 1}, + weight{weight_t, "weight", 2}; return miopen_convolution_backward_input( "miopen_convolution_backward_input", - input_size, grad_output, weight, - padding, stride, dilation, groups, benchmark, deterministic); + input_size, + grad_output, + weight, + padding, + stride, + dilation, + groups, + benchmark, + deterministic, + depthwise); } -//Depthwise convolutions backward data. 
-void raw_miopen_depthwise_convolution_backward_input_out( - const at::Tensor& grad_input, - const at::Tensor& grad_output, - const at::Tensor& weight, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic) { - - auto dataType = getMiopenDataType(grad_output); - miopenConvolutionMode_t c_mode = miopenDepthwise; +void raw_miopen_convolution_backward_weight_out_32bit( + const Tensor& grad_weight, + const Tensor& grad_output, + const Tensor& input, + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic, + bool depthwise=false) { + auto dataType = getMiopenDataType(input); + miopenConvolutionMode_t c_mode = depthwise ? miopenDepthwise : miopenConvolution; - ConvolutionArgs args{ grad_input, grad_output, weight }; + ConvolutionArgs args{input, grad_output, grad_weight}; args.handle = getMiopenHandle(); - setConvolutionParams(&args.params, args.handle, grad_input, weight, padding, stride, dilation, groups, deterministic); - args.idesc.set(grad_input); - args.wdesc.set(weight, grad_output.suggest_memory_format(), 0); - args.odesc.set(grad_output); - args.cdesc.set(dataType, c_mode, grad_output.dim() - 2, args.params.padding, args.params.stride, args.params.dilation, args.params.groups, benchmark, deterministic); + at::MemoryFormat memory_format = + miopen_conv_suggest_memory_format(input, grad_weight); + setConvolutionParams( + &args.params, + args.handle, + input, + grad_weight, + padding, + stride, + dilation, + groups, + deterministic, + memory_format); + args.idesc.set(input, memory_format); + args.wdesc.set(grad_weight, memory_format, 0); + args.odesc.set(grad_output, memory_format); + args.cdesc.set( + dataType, + c_mode, + input.dim() - 2, + args.params.padding, + args.params.stride, + args.params.dilation, + args.params.groups, + benchmark, + deterministic); if (at::globalContext().immediateMiopen()) { uint64_t solution_id; - Workspace workspace = chooseSolution(args, &solution_id); + Workspace workspace = chooseSolution(args, &solution_id); - MIOPEN_CHECK(miopenConvolutionBackwardDataImmediate( + MIOPEN_CHECK(miopenConvolutionBackwardWeightsImmediate( args.handle, args.odesc.desc(), grad_output.const_data_ptr(), - args.wdesc.desc(), weight.const_data_ptr(), + args.idesc.desc(), input.const_data_ptr(), args.cdesc.desc(), - args.idesc.desc(), grad_input.mutable_data_ptr(), workspace.data, workspace.size, solution_id)); + args.wdesc.desc(), grad_weight.data_ptr(), + workspace.data, + workspace.size, + solution_id)); + } + else { + miopenConvBwdWeightsAlgorithm_t bwdFilterAlg; + Workspace workspace = chooseAlgorithm(args, benchmark, &bwdFilterAlg); + + Constant one(dataType, 1); + Constant zero(dataType, 0); + + MIOPEN_CHECK(miopenConvolutionBackwardWeights( + args.handle, + &one, + args.odesc.desc(), grad_output.const_data_ptr(), + args.idesc.desc(), input.const_data_ptr(), + args.cdesc.desc(), + bwdFilterAlg, + &zero, + args.wdesc.desc(), grad_weight.data_ptr(), + workspace.data, + workspace.size)); + } +} + +void raw_miopen_convolution_backward_weight_out( + const Tensor& grad_weight, + const Tensor& grad_output, + const Tensor& input, + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic, + bool depthwise=false) { + constexpr int64_t int_max = std::numeric_limits::max(); + const int64_t ni = input.numel(); + const int64_t no = grad_output.numel(); + // Assume the shape of the tensor 
is (N, C, D1, D2, ...)
+  // if N * C * D1 * D2 * ... <= int_max, then no need to split at all
+  if (ni <= int_max && no <= int_max) {
+    raw_miopen_convolution_backward_weight_out_32bit(
+        grad_weight,
+        grad_output,
+        input,
+        padding,
+        stride,
+        dilation,
+        groups,
+        benchmark,
+        deterministic,
+        depthwise);
+    return;
   }
-  else {
-    miopenConvBwdDataAlgorithm_t bwdDataAlg;
-    Workspace workspace = chooseAlgorithm(args, benchmark, &bwdDataAlg);
-
-    Constant one(dataType, 1);
-    Constant zero(dataType, 0);
-
-    MIOPEN_CHECK(miopenConvolutionBackwardData(
-        args.handle,
-        &one, args.odesc.desc(), grad_output.const_data_ptr(),
-        args.wdesc.desc(), weight.const_data_ptr(),
-        args.cdesc.desc(), bwdDataAlg, &zero,
-        args.idesc.desc(), grad_input.mutable_data_ptr(), workspace.data, workspace.size));
+  // else, if C * D1 * D2 * ... <= int_max, then we just need to split across
+  // the N dimension
+  //
+  // Here we use a simple heuristic to determine the size of each split.
+  // We don't max out the 2^31 address space because a request that large is
+  // very likely to OOM.
+  int64_t n = grad_output.size(0);
+  int64_t max_inner_size = std::max(ni, no) / n;
+  int64_t split_size =
+      std::max<int64_t>(1024 * 1024 * 512 / max_inner_size, 1L);
+  int64_t num_splits = (n + split_size - 1) / split_size;
+  if (split_size * max_inner_size < int_max) {
+    const auto kAccType = (grad_weight.scalar_type() == kHalf ||
+                           grad_weight.scalar_type() == kBFloat16)
+        ? kFloat
+        : grad_weight.scalar_type();
+    Tensor grad_weight_accumulator =
+        at::zeros(grad_weight.sizes(), grad_weight.options().dtype(kAccType));
+    for (const auto i : c10::irange(num_splits)) {
+      int64_t start = split_size * i;
+      int64_t split_size_ = std::min(split_size, n - start);
+      Tensor input_ = input.narrow(0, start, split_size_);
+      Tensor grad_output_ = grad_output.narrow(0, start, split_size_);
+      Tensor grad_weight_ = at::empty_like(grad_weight);
+      raw_miopen_convolution_backward_weight_out_32bit(
+          grad_weight_,
+          grad_output_,
+          input_,
+          padding,
+          stride,
+          dilation,
+          groups,
+          benchmark,
+          deterministic,
+          depthwise);
+      grad_weight_accumulator.add_(grad_weight_);
+    }
+    grad_weight.copy_(grad_weight_accumulator);
+    return;
   }
+  // If control flow reaches here, even splitting along N is not enough, and
+  // things start to become complicated. For example, for conv2d the following
+  // questions need to be considered:
+  // - Is the memory layout NCHW or NHWC?
+  // - If the conv is NCHW -> NC'H'W', then should we
+  //   - split only NC?
+  //   - split only N'C'?
+  //   - split both?
+  // - If the conv is NHWC, then we need to split across H, and we need to be
+  //   very careful about the boundary condition to make sure that the
+  //   boundary is handled correctly.
+  // - If we decide to make these splits, is the memory contiguous? Do we need
+  //   to copy the memory?
+  // Considering the complexity of this issue, it is better not to use
+  // MIOpen for this case.
+  TORCH_INTERNAL_ASSERT(false, "This case should not be dispatched to MIOpen.");
 }
-Tensor miopen_depthwise_convolution_backward_input(
+Tensor miopen_convolution_backward_weight(
     CheckedFrom c,
-    IntArrayRef input_size, const TensorArg& grad_output, const TensorArg& weight,
-    IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups,
-    bool benchmark, bool deterministic)
-{
-  checkAllSameType(c, {grad_output, weight});
-  checkAllSameGPU(c, {grad_output, weight});
+    IntArrayRef weight_size,
+    const Tensor& grad_output_t,
+    const Tensor& input_t,
+    IntArrayRef padding,
+    IntArrayRef stride,
+    IntArrayRef dilation,
+    int64_t groups,
+    bool benchmark,
+    bool deterministic,
+    bool depthwise=false) {
+  auto memory_format = miopen_conv_suggest_memory_format(input_t, grad_output_t);
-  auto memory_format = at::MemoryFormat::Contiguous;
-  if (miopen_conv_use_channels_last(*grad_output, *weight)) {
-    memory_format = (weight->ndimension() == 5) ? at::MemoryFormat::ChannelsLast3d : at::MemoryFormat::ChannelsLast;
-  }
+  Tensor grad_output_contig_t = grad_output_t.contiguous(memory_format);
+  TensorArg grad_output_contig{grad_output_contig_t, "grad_output", 1};
-  Tensor grad_input_t = at::detail::empty_cuda(
-      input_size, grad_output->options().memory_format(memory_format));
+  Tensor input_contig_t = input_t.contiguous(memory_format);
+  TensorArg input{input_contig_t, "input", 2};
-  TensorArg grad_input{ grad_input_t, "result", 0 };
-  convolution_shape_check(c, grad_input, weight, grad_output, padding, stride, dilation, groups);
+  checkAllSameType(c, {grad_output_contig, input});
+  checkAllSameGPU(c, {grad_output_contig, input});
-  // See #4500
-  Tensor weight_contig = weight->contiguous(memory_format);
-  // Make sure that NC11 strides follow formula
-  weight_contig.resize_(weight_contig.sizes(), memory_format);
+  auto grad_weight_t =
+      at::empty(weight_size, grad_output_contig->options(), memory_format);
-  Tensor grad_output_contig = grad_output->contiguous(memory_format);
-  grad_output_contig.resize_(grad_output_contig.sizes(), memory_format);
+  // For uniformity with everything else, although it seems grad_weight
+  // would be unambiguous too.
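+  // (convolution_shape_check below only validates that weight_size and the
+  // contiguous grad_output/input are mutually consistent with
+  // padding/stride/dilation/groups; the actual reduction happens in
+  // raw_miopen_convolution_backward_weight_out.)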
+ TensorArg grad_weight{grad_weight_t, "result", 0}; + convolution_shape_check( + c, + input, + grad_weight, + grad_output_contig, + padding, + stride, + dilation, + groups); - raw_miopen_depthwise_convolution_backward_input_out( - *grad_input, grad_output_contig, weight_contig, - padding, stride, dilation, groups, benchmark, deterministic); + raw_miopen_convolution_backward_weight_out( + *grad_weight, + *grad_output_contig, + *input, + padding, + stride, + dilation, + groups, + benchmark, + deterministic, + depthwise); - return *grad_input; + return grad_weight_t; } -Tensor miopen_depthwise_convolution_backward_input( - IntArrayRef input_size, const Tensor& grad_output_t, const Tensor& weight_t, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic) -{ - TensorArg grad_output{ grad_output_t, "grad_output", 1 }, - weight{ weight_t, "weight", 2 }; - return miopen_depthwise_convolution_backward_input( - "miopen_depthwise_convolution_backward_input", - input_size, grad_output, weight, - padding, stride, dilation, groups, benchmark, deterministic); +// overload +Tensor miopen_convolution_backward_weight( + IntArrayRef weight_size, + const Tensor& grad_output_t, + const Tensor& input_t, + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic, + bool depthwise=false) { + return miopen_convolution_backward_weight( + "miopen_convolution_backward_weight", + weight_size, + grad_output_t, + input_t, + padding, + stride, + dilation, + groups, + benchmark, + deterministic, + depthwise); } -std::tuple miopen_convolution_backward( - const at::Tensor& input, const at::Tensor& grad_output_t, const at::Tensor& weight, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic, std::array output_mask) { - - Tensor grad_output = grad_output_t.contiguous(input.suggest_memory_format()); +std::tuple miopen_convolution_backward( + const at::Tensor& input, + const at::Tensor& grad_output_t, + const at::Tensor& weight, + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic, + std::array output_mask) { + Tensor grad_output = grad_output_t.to(input.suggest_memory_format()); Tensor grad_input, grad_weight, grad_bias; - if (output_mask[0]) { - grad_input = miopen_convolution_backward_input(input.sizes(), grad_output, weight, padding, stride, dilation, groups, benchmark, deterministic); - } - if (output_mask[1]) { - grad_weight = miopen_convolution_backward_weight(weight.sizes(), grad_output, input, padding, stride, dilation, groups, benchmark, deterministic); - } - if (output_mask[2]) { - grad_bias = miopen_convolution_backward_bias(grad_output); + if (input.numel() == 0) { + if (output_mask[0]) { + grad_input = at::empty_like(input, LEGACY_CONTIGUOUS_MEMORY_FORMAT); + } + if (output_mask[1]) { + grad_weight = at::zeros_like(weight, LEGACY_CONTIGUOUS_MEMORY_FORMAT); + } + if (output_mask[2]) { + grad_bias = at::zeros_like(grad_output_t, LEGACY_CONTIGUOUS_MEMORY_FORMAT); + } + } else { + if (output_mask[0]) { + grad_input = miopen_convolution_backward_input( + input.sizes(), + grad_output, + weight, + padding, + stride, + dilation, + groups, + benchmark, + deterministic); + } + if (output_mask[1]) { + grad_weight = miopen_convolution_backward_weight( + weight.sizes(), + grad_output, + input, + padding, + stride, + dilation, + groups, + benchmark, + 
deterministic); + } + if (output_mask[2]) { + grad_bias = miopen_convolution_backward_bias(grad_output); + } } - return std::tuple{grad_input, grad_weight, grad_bias}; + return std::tuple{grad_input, grad_weight, grad_bias}; } -std::tuple miopen_depthwise_convolution_backward( - const at::Tensor& input, const at::Tensor& grad_output_t, const at::Tensor& weight, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic, std::array output_mask) { +Tensor miopen_convolution_transpose_forward( + CheckedFrom c, + const TensorArg& grad_output, + const TensorArg& weight, + IntArrayRef padding, + IntArrayRef output_padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic) { + auto input_size = conv_input_size( + grad_output->sizes(), + weight->sizes(), + padding, + output_padding, + stride, + dilation, + groups); + return miopen_convolution_backward_input( + c, + input_size, + grad_output, + weight, + padding, + stride, + dilation, + groups, + benchmark, + deterministic); +} - Tensor grad_output = grad_output_t.contiguous(input.suggest_memory_format()); +Tensor miopen_convolution_transpose_backward_weight( + IntArrayRef weight_size, + const Tensor& grad_output_t, + const Tensor& input_t, + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic) { + return miopen_convolution_backward_weight( + "miopen_convolution_backward_weight", + weight_size, + input_t, + grad_output_t, + padding, + stride, + dilation, + groups, + benchmark, + deterministic); +} - Tensor grad_input, grad_weight, grad_bias; - if (output_mask[0]) { - grad_input = miopen_depthwise_convolution_backward_input(input.sizes(), grad_output, weight, padding, stride, dilation, groups, benchmark, deterministic); - } - if (output_mask[1]) { - grad_weight = miopen_depthwise_convolution_backward_weight(weight.sizes(), grad_output, input, padding, stride, dilation, groups, benchmark, deterministic); - } - if (output_mask[2]) { - grad_bias = miopen_convolution_backward_bias(grad_output); - } +Tensor miopen_convolution_transpose( + const Tensor& input_t, + const Tensor& weight_t, + const std::optional& bias_t_opt, + IntArrayRef padding, + IntArrayRef output_padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic) { + // See [Note: hacky wrapper removal for optional tensor] + c10::MaybeOwned bias_t_maybe_owned = at::borrow_from_optional_tensor(bias_t_opt); + const Tensor& bias_t = *bias_t_maybe_owned; - return std::tuple{grad_input, grad_weight, grad_bias}; + TensorArg input{input_t, "input", 1}, weight{weight_t, "weight", 2}, bias{bias_t, "bias", 3}; + CheckedFrom c = "miopen_convolution_transpose"; + auto output_t = miopen_convolution_transpose_forward( + c, + input, + weight, + padding, + output_padding, + stride, + dilation, + groups, + benchmark, + deterministic); + if (bias->defined()) { + miopen_convolution_add_bias_(c, { output_t, "result", 0 }, bias); + } + return output_t; } -Tensor miopen_convolution_transpose( - const Tensor& input_t, const Tensor& weight_t, const std::optional& bias_t_opt, - IntArrayRef padding, IntArrayRef output_padding, IntArrayRef stride, IntArrayRef dilation, - int64_t groups, bool benchmark, bool deterministic) +// --------------------------------------------------------------------- +// +// Convolution depthwise +// +// 
--------------------------------------------------------------------- + +Tensor miopen_depthwise_convolution( + const Tensor& input_t, + const Tensor& weight_t, + const std::optional& bias_t_opt, + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned bias_t_maybe_owned = at::borrow_from_optional_tensor(bias_t_opt); @@ -1474,16 +1690,86 @@ Tensor miopen_convolution_transpose( TensorArg input { input_t, "input", 1 }, weight { weight_t, "weight", 2 }, bias { bias_t, "bias", 3 }; - CheckedFrom c = "miopen_convolution_transpose"; - auto output_t = miopen_convolution_transpose_forward( - c, input, weight, padding, output_padding, stride, dilation, groups, benchmark, deterministic); + CheckedFrom c = "miopen_depthwise_convolution"; + auto memory_format = miopen_conv_suggest_memory_format(input_t, weight_t); + Tensor output_t = at::detail::empty_cuda( + conv_output_size( + input_t.sizes(), weight_t.sizes(), padding, stride, dilation), + input_t.options().memory_format(memory_format)); + if (output_t.numel() == 0) { + return output_t; + } + // Avoid ambiguity of "output" when this is being used as backwards + TensorArg output{output_t, "result", 0}; + miopen_convolution_forward_out( + output, + c, + input, + weight, + padding, + stride, + dilation, + groups, + benchmark, + deterministic, + true); if (bias->defined()) { - miopen_convolution_add_bias_(c, { output_t, "result", 0 }, bias); + miopen_convolution_add_bias_(c, output, bias); } - return output_t; + return *output; } -// MIOpen fused convolution bias activation forward +std::tuple miopen_depthwise_convolution_backward( + const at::Tensor& input, + const at::Tensor& grad_output_t, + const at::Tensor& weight, + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic, + std::array output_mask) { + Tensor grad_output = grad_output_t.to(input.suggest_memory_format()); + + Tensor grad_input, grad_weight, grad_bias; + if (output_mask[0]) { + grad_input = miopen_convolution_backward_input( + input.sizes(), + grad_output, + weight, + padding, + stride, + dilation, + groups, + benchmark, + deterministic, + true); + } + if (output_mask[1]) { + grad_weight = miopen_convolution_backward_weight( + weight.sizes(), + grad_output, + input, + padding, + stride, + dilation, + groups, + benchmark, + deterministic, + true); + } + if (output_mask[2]) { + grad_bias = miopen_convolution_backward_bias(grad_output); + } + + return std::tuple{grad_input, grad_weight, grad_bias}; +} + +// --------------------------------------------------------------------- +// fusions +// --------------------------------------------------------------------- + void raw_miopen_convolution_relu_out( const Tensor& output, const Tensor& input, @@ -1495,17 +1781,35 @@ void raw_miopen_convolution_relu_out( int64_t groups, bool benchmark, bool deterministic) { - auto dataType = getMiopenDataType(input); miopenConvolutionMode_t c_mode = miopenConvolution; - ConvolutionArgs args{ input, output, weight }; args.handle = getMiopenHandle(); - setConvolutionParams(&args.params, args.handle, input, weight, padding, stride, dilation, groups, deterministic); - args.idesc.set(input); - args.wdesc.set(weight, input.suggest_memory_format(), 0); - args.odesc.set(output); - args.cdesc.set(dataType, c_mode, input.dim() - 2, args.params.padding, args.params.stride, args.params.dilation, 
args.params.groups, benchmark, deterministic); + at::MemoryFormat memory_format = miopen_conv_suggest_memory_format(input, weight); + setConvolutionParams( + &args.params, + args.handle, + input, + weight, + padding, + stride, + dilation, + groups, + deterministic, + memory_format); + args.idesc.set(input, memory_format); + args.wdesc.set(weight, memory_format, 0); + args.odesc.set(output, memory_format); + args.cdesc.set( + dataType, + c_mode, + input.dim() - 2, + args.params.padding, + args.params.stride, + args.params.dilation, + args.params.groups, + benchmark, + deterministic); TensorDescriptor bdesc; bdesc.set(bias.expand({1, bias.size(0)}), output.dim()); @@ -1549,8 +1853,8 @@ static at::Tensor self_or_new_memory_format(at::Tensor& self, at::MemoryFormat m } Tensor miopen_convolution_add_relu( - const Tensor& input, - const Tensor& weight, + const Tensor& input_t, + const Tensor& weight_t, const Tensor& z, const std::optional& alpha, const std::optional& bias, @@ -1562,17 +1866,28 @@ Tensor miopen_convolution_add_relu( // MIOpen does not support fusion of add, the alpha2 * z step of the below cuDNN function: // y = act ( alpha1 * conv(x) + alpha2 * z + bias ) - auto memory_format = input.suggest_memory_format(); + auto memory_format = miopen_conv_suggest_memory_format(input_t, weight_t); auto& ctx = at::globalContext(); bool benchmark = ctx.benchmarkCuDNN(); - TensorArg input_arg { input, "input", 1 }, - weight_arg { weight, "weight", 2 }; - auto output = miopen_convolution_forward( + TensorArg input { input_t, "input", 1 }, + weight { weight_t, "weight", 2 }; + + Tensor output_t = at::detail::empty_cuda( + conv_output_size( + input_t.sizes(), weight_t.sizes(), padding, stride, dilation), + input_t.options().memory_format(memory_format)); + if (output_t.numel() == 0){ + return output_t; + } + // Avoid ambiguity of "output" when this is being used as backwards + TensorArg output{output_t, "result", 0}; + miopen_convolution_forward_out( + output, "miopen_convolution_add_relu", - input_arg, - weight_arg, + input, + weight, padding, stride, dilation, @@ -1581,53 +1896,51 @@ Tensor miopen_convolution_add_relu( false // deterministic ); - auto contig_output = self_or_new_memory_format(output, memory_format); + auto contig_output_t = self_or_new_memory_format(output_t, memory_format); - if (!output.is_same(contig_output)) { - contig_output.copy_(output); + if (!output_t.is_same(contig_output_t)) { + contig_output_t.copy_(output_t); } auto _alpha = alpha.has_value() ? alpha.value().to() : 1.0; auto _bias = bias.has_value() ? 
bias.value() : at::zeros( - {contig_output.size(1)}, - optTypeMetaToScalarType(contig_output.options().dtype_opt()), - contig_output.options().layout_opt(), - contig_output.options().device_opt(), - contig_output.options().pinned_memory_opt()); + {contig_output_t.size(1)}, + optTypeMetaToScalarType(contig_output_t.options().dtype_opt()), + contig_output_t.options().layout_opt(), + contig_output_t.options().device_opt(), + contig_output_t.options().pinned_memory_opt()); - at::Tensor alpha_mul_z_add_bias = at::native::reshape_bias(input.dim(), _bias).add(z, _alpha); - contig_output.add_(alpha_mul_z_add_bias); - contig_output.relu_(); + at::Tensor alpha_mul_z_add_bias = at::native::reshape_bias(input_t.dim(), _bias).add(z, _alpha); + contig_output_t.add_(alpha_mul_z_add_bias); + contig_output_t.relu_(); - return contig_output; + return contig_output_t; } Tensor miopen_convolution_relu( - const Tensor& input, - const Tensor& weight, + const Tensor& input_t, + const Tensor& weight_t, const std::optional& bias, IntArrayRef stride, IntArrayRef padding, IntArrayRef dilation, int64_t groups) { - auto memory_format = input.suggest_memory_format(); - auto& ctx = at::globalContext(); bool benchmark = ctx.benchmarkCuDNN(); // MIOpen currently only supports MemoryFormat::Contiguous and fp32 and 2d - if (input.suggest_memory_format() == at::MemoryFormat::Contiguous - && input.scalar_type() == at::kFloat - && input.ndimension() == 4) { + if (input_t.suggest_memory_format() == at::MemoryFormat::Contiguous + && input_t.scalar_type() == at::kFloat + && input_t.ndimension() == 4) { // FuseFrozenConvAddRelu performs some tensor shape checking Tensor output_t = at::detail::empty_cuda( conv_output_size( - input.sizes(), weight.sizes(), padding, stride, dilation), - input.options().memory_format(input.suggest_memory_format())); + input_t.sizes(), weight_t.sizes(), padding, stride, dilation), + input_t.options().memory_format(input_t.suggest_memory_format())); if (output_t.numel() == 0) { return output_t; } @@ -1643,8 +1956,8 @@ Tensor miopen_convolution_relu( raw_miopen_convolution_relu_out( output_t, - input, - weight, + input_t, + weight_t, _bias, stride, padding, @@ -1659,12 +1972,25 @@ Tensor miopen_convolution_relu( else { // fallback - TensorArg input_arg { input, "input", 1 }, - weight_arg { weight, "weight", 2 }; - auto output = miopen_convolution_forward( + auto memory_format = miopen_conv_suggest_memory_format(input_t, weight_t); + + TensorArg input { input_t, "input", 1 }, + weight { weight_t, "weight", 2 }; + + Tensor output_t = at::detail::empty_cuda( + conv_output_size( + input_t.sizes(), weight_t.sizes(), padding, stride, dilation), + input->options().memory_format(memory_format)); + if (output_t.numel() == 0){ + return output_t; + } + // Avoid ambiguity of "output" when this is being used as backwards + TensorArg output{output_t, "result", 0}; + miopen_convolution_forward_out( + output, "miopen_convolution_relu", - input_arg, - weight_arg, + input, + weight, padding, stride, dilation, @@ -1673,26 +1999,26 @@ Tensor miopen_convolution_relu( false // deterministic ); - auto contig_output = self_or_new_memory_format(output, memory_format); + auto contig_output_t = self_or_new_memory_format(output_t, memory_format); - if (!output.is_same(contig_output)) { - contig_output.copy_(output); + if (!output_t.is_same(contig_output_t)) { + contig_output_t.copy_(output_t); } auto _bias = bias.has_value() ? 
bias.value() : at::zeros( - {contig_output.size(1)}, - optTypeMetaToScalarType(contig_output.options().dtype_opt()), - contig_output.options().layout_opt(), - contig_output.options().device_opt(), - contig_output.options().pinned_memory_opt()); + {contig_output_t.size(1)}, + optTypeMetaToScalarType(contig_output_t.options().dtype_opt()), + contig_output_t.options().layout_opt(), + contig_output_t.options().device_opt(), + contig_output_t.options().pinned_memory_opt()); - at::Tensor reshaped_bias = at::native::reshape_bias(input.dim(), _bias); - contig_output.add_(reshaped_bias); - contig_output.relu_(); + at::Tensor reshaped_bias = at::native::reshape_bias(input_t.dim(), _bias); + contig_output_t.add_(reshaped_bias); + contig_output_t.relu_(); - return contig_output; + return contig_output_t; } } diff --git a/aten/src/ATen/native/mkldnn/xpu/Attention.cpp b/aten/src/ATen/native/mkldnn/xpu/Attention.cpp index ef485904f977..873005b3dd2b 100644 --- a/aten/src/ATen/native/mkldnn/xpu/Attention.cpp +++ b/aten/src/ATen/native/mkldnn/xpu/Attention.cpp @@ -260,7 +260,7 @@ _scaled_dot_product_fused_attention_overrideable_xpu( alloc_with_matching_layout(query, output, output_shape); at::Tensor logsumexp, debug_attn_mask; // not supported - at::native::onednn::gpu_float_sdpa( + at::native::onednn::sdpa( batch_size, seq_len_q, seq_len_kv, @@ -274,7 +274,9 @@ _scaled_dot_product_fused_attention_overrideable_xpu( attn_bias, is_causal, scale.has_value() ? scale.value() : (1.0 / std::sqrt(head_dim_qk)), - output); + output, + false, + logsumexp); // rng not used auto philox_seed = at::empty({}, at::dtype(at::kLong)); diff --git a/aten/src/ATen/native/mkldnn/xpu/detail/Attention.cpp b/aten/src/ATen/native/mkldnn/xpu/detail/Attention.cpp index 1d90711f6e38..e840e21f4f7a 100644 --- a/aten/src/ATen/native/mkldnn/xpu/detail/Attention.cpp +++ b/aten/src/ATen/native/mkldnn/xpu/detail/Attention.cpp @@ -13,6 +13,9 @@ using dims = logical_tensor::dims; using op = dnnl::graph::op; using partition = dnnl::graph::partition; +constexpr logical_tensor::data_type sdpa_intermediate_dtype = + logical_tensor::data_type::f32; + inline data_type to_logical_tensor_data_type(c10::ScalarType scalar_type) { return scalar_type == c10::ScalarType::Float ? data_type::f32 : scalar_type == c10::ScalarType::Half ? 
data_type::f16 @@ -20,6 +23,8 @@ inline data_type to_logical_tensor_data_type(c10::ScalarType scalar_type) { : data_type::undef; } +namespace sdpa_forward { + struct SDPALogicalParams { enum class TensorID { query, @@ -28,7 +33,8 @@ struct SDPALogicalParams { neg_inf, attn_mask, value, - output, + attention, + logsumexp, end, }; @@ -38,14 +44,16 @@ struct SDPALogicalParams { std::optional neg_inf; std::optional attn_mask; logical_tensor value{}; - logical_tensor output{}; + logical_tensor attention{}; + std::optional logsumexp; SDPALogicalParams( const at::Tensor& query_, const at::Tensor& key_, const at::Tensor& value_, const std::optional& attn_mask_, - const at::Tensor& output_, + const at::Tensor& attention_, + const at::Tensor& logsumexp_, int batch_size, int seq_len_q, int seq_len_kv, @@ -53,19 +61,26 @@ struct SDPALogicalParams { int num_head_kv, int head_dim_qk, int head_dim_v, - bool is_causal) { + bool is_causal, + bool compute_logsumexp) { const data_type dtype = to_logical_tensor_data_type(query_.scalar_type()); TORCH_INTERNAL_ASSERT( (dtype != data_type::undef), "Only FP16/BF16/FP32 datatypes are currently supported"); + TORCH_INTERNAL_ASSERT( + query_.scalar_type() == attention_.scalar_type(), + "scaled_dot_product_attention_xpu: query and attention tensors should have the same data type."); const dims scalar_shape = {1}; - std::vector inputLogicalTensors; at::Tensor reshaped_query = query_; at::Tensor reshaped_key = key_; at::Tensor reshaped_value = value_; - at::Tensor reshaped_output = output_; + at::Tensor reshaped_attention = attention_; + at::Tensor reshaped_logsumexp = + compute_logsumexp ? logsumexp_.unsqueeze(-1) : logsumexp_; at::Tensor reshaped_attn_mask = attn_mask_.value_or(at::Tensor()); + + // handle broadcasted input tensors for OneDNN if (at::native::onednn::is_broadcast(reshaped_query)) { at::native::onednn::undo_broadcast(reshaped_query); } @@ -75,9 +90,6 @@ struct SDPALogicalParams { if (at::native::onednn::is_broadcast(reshaped_value)) { at::native::onednn::undo_broadcast(reshaped_value); } - if (at::native::onednn::is_broadcast(reshaped_output)) { - at::native::onednn::undo_broadcast(reshaped_output); - } if (attn_mask_.has_value() && at::native::onednn::is_broadcast(reshaped_attn_mask)) { at::native::onednn::undo_broadcast(reshaped_attn_mask); @@ -95,23 +107,22 @@ struct SDPALogicalParams { {batch_size, group_num, group_size, seq_len_q, head_dim_qk}); reshaped_key = key_.unsqueeze(2); reshaped_value = value_.unsqueeze(2); - reshaped_output = output_.view( + reshaped_attention = attention_.view( {batch_size, group_num, group_size, seq_len_q, head_dim_v}); if (attn_mask_.has_value() && attn_mask_.value().dim() == 4) { reshaped_attn_mask = attn_mask_.value().unsqueeze(2); } } - query = { - static_cast(TensorID::query), - dtype, - reshaped_query.sizes().vec(), - reshaped_query.strides().vec()}; - key = { - static_cast(TensorID::key), - dtype, - reshaped_key.sizes().vec(), - reshaped_key.strides().vec()}; +#define LOGIC_TENSOR_DESC(name, dtype) \ + name = { \ + static_cast(TensorID::name), \ + dtype, \ + reshaped_##name.sizes().vec(), \ + reshaped_##name.strides().vec()} + + LOGIC_TENSOR_DESC(query, dtype); + LOGIC_TENSOR_DESC(key, dtype); scale = { static_cast(TensorID::scale), to_logical_tensor_data_type(at::toOpMathType(query_.scalar_type())), @@ -132,22 +143,19 @@ struct SDPALogicalParams { TORCH_INTERNAL_ASSERT( (mask_dtype != data_type::undef), "Only FP16/BF16/FP32 datatypes are currently supported for attn_mask"); - attn_mask = { - 
static_cast(TensorID::attn_mask), - mask_dtype, - reshaped_attn_mask.sizes().vec(), - reshaped_attn_mask.strides().vec()}; + LOGIC_TENSOR_DESC(attn_mask, mask_dtype); } - value = { - static_cast(TensorID::value), - dtype, - reshaped_value.sizes().vec(), - reshaped_value.strides().vec()}; - output = { - static_cast(TensorID::output), - dtype, - reshaped_output.sizes().vec(), - reshaped_output.strides().vec()}; + LOGIC_TENSOR_DESC(value, dtype); + LOGIC_TENSOR_DESC(attention, dtype); + if (compute_logsumexp) { + TORCH_INTERNAL_ASSERT( + logsumexp_.scalar_type() == at::kFloat, + "scaled_dot_product_attention: Expected logsumexp data type in FP32, but got ", + logsumexp_.scalar_type(), + " instead."); + LOGIC_TENSOR_DESC(logsumexp, sdpa_intermediate_dtype); + } +#undef LOGIC_TENSOR_DESC } std::vector get_input() const { std::vector input = {query, key, scale}; @@ -161,16 +169,21 @@ struct SDPALogicalParams { return input; } std::vector get_output() const { - return {output}; + std::vector output; + output.push_back(attention); + if (logsumexp.has_value()) { + output.push_back(logsumexp.value()); + } + return output; } }; partition create_sdpa_graph_partition( bool is_causal, + bool compute_logsumexp, data_type dtype, const SDPALogicalParams& params) { // graph building and partitioning - // currently, we assume that Q and K have same sequence length size_t lt_id = static_cast(SDPALogicalParams::TensorID::end); size_t op_id = 0; @@ -180,7 +193,7 @@ partition create_sdpa_graph_partition( // Matrix Extensions (Intel(R) XMX) support, which means the // Q/K/V tensors have bf16 or f16 data type while the output of the first // MatMul, Scale, Mask, and the input of SoftMax are in f32 data type. - logical_tensor matmul_qk_out{lt_id++, data_type::f32}; + logical_tensor matmul_qk_out{lt_id++, sdpa_intermediate_dtype}; op matmul_qk{ op_id++, op::kind::MatMul, @@ -189,7 +202,7 @@ partition create_sdpa_graph_partition( "matmul_qk"}; matmul_qk.set_attr(op::attr::transpose_b, true); - logical_tensor scaled_qk_out{lt_id++, data_type::f32}; + logical_tensor scaled_qk_out{lt_id++, sdpa_intermediate_dtype}; op scale_mul{ op_id++, op::kind::Multiply, @@ -214,7 +227,7 @@ partition create_sdpa_graph_partition( if (params.attn_mask.has_value()) { TORCH_INTERNAL_ASSERT( !is_causal, "Additive mask cannot use with is_causal."); - masked_qk_out = {lt_id++, data_type::f32}; + masked_qk_out = {lt_id++, sdpa_intermediate_dtype}; mask_add = { op_id++, op::kind::Add, @@ -249,7 +262,7 @@ partition create_sdpa_graph_partition( {mask_gt_out.value()}, "mask_gt"}; - masked_qk_out = {lt_id++, data_type::f32}; + masked_qk_out = {lt_id++, sdpa_intermediate_dtype}; mask_select = { op_id++, op::kind::Select, @@ -270,12 +283,15 @@ partition create_sdpa_graph_partition( logical_tensor softmax_out{lt_id++, dtype}; softmax.add_input(masked_qk_out.value_or(scaled_qk_out)); softmax.add_output(softmax_out); + if (compute_logsumexp) { + softmax.add_output(params.logsumexp.value()); + } op matmul_v{ op_id++, op::kind::MatMul, {softmax_out, params.value}, - {params.output}, + {params.attention}, "matmul_v"}; constexpr auto ekind = dnnl::engine::kind::gpu; @@ -304,44 +320,469 @@ partition create_sdpa_graph_partition( partition& find_or_create_graph_partition( bool is_causal, + bool compute_logsumexp, const SDPALogicalParams& params) { - thread_local static PartitionCache cache; + thread_local PartitionCache cache; const data_type dtype = params.query.get_data_type(); // cache key creation // patternID is determined on the basis of the arguments 
provided std::bitset<32> patternID; if (dtype == data_type::f32) { - // bit 3 corresponds to float32 dtype - patternID.set(3, 1); + patternID.set(static_cast(PartitionCache::BitType::Float32), 1); } if (dtype == data_type::bf16) { - // bit 2 corresponds to fp16/bf16 dtype - patternID.set(2, 1); + patternID.set(static_cast(PartitionCache::BitType::Bfloat16), 1); } // sdp pattern - patternID.set(4, 1); + patternID.set(static_cast(PartitionCache::BitType::SdpaPattern), 1); // Refer to comments in Utils.h. The first 8 bits are reserved int pos = 8; // attn_mask patternID.set(pos++, params.attn_mask.has_value()); patternID.set(pos++, is_causal); + // compute_logsumexp + patternID.set(pos++, compute_logsumexp); auto partition_ = cache.find_partition(patternID); if (!partition_.has_value()) { // partition cache no hit // graph building and partitioning - partition sdp_partition = - create_sdpa_graph_partition(is_causal, dtype, params); + partition sdp_partition = create_sdpa_graph_partition( + is_causal, compute_logsumexp, dtype, params); partition_ = cache.insert_partition_cache(patternID, sdp_partition); } return *partition_; } +} // namespace sdpa_forward + +namespace sdpa_backward { + +struct SDPABackwardLogicalParams { + enum class TensorID { + grad_out, + query, + key, + value, + out, + logsumexp, + scale, + neg_inf, + attn_mask, + grad_query, + grad_key, + grad_value, + end, + }; + + logical_tensor grad_out{}; + logical_tensor query{}; + logical_tensor key{}; + logical_tensor value{}; + logical_tensor out{}; + logical_tensor logsumexp{}; + logical_tensor scale{}; + std::optional neg_inf; + std::optional attn_mask; + logical_tensor grad_query{}; + logical_tensor grad_key{}; + logical_tensor grad_value{}; + + SDPABackwardLogicalParams( + const at::Tensor& grad_out_, + const at::Tensor& query_, + const at::Tensor& key_, + const at::Tensor& value_, + const at::Tensor& out_, + const at::Tensor& logsumexp_, + const std::optional& attn_mask_, + const at::Tensor& grad_query_, + const at::Tensor& grad_key_, + const at::Tensor& grad_value_, + int batch_size, + int num_head_q, + int num_head_kv, + int seq_len_q, + int seq_len_kv, + int head_dim_qk, + int head_dim_v, + bool is_causal) { + const data_type dtype = to_logical_tensor_data_type(query_.scalar_type()); + TORCH_INTERNAL_ASSERT( + (dtype != data_type::undef), + "Only FP16/BF16/FP32 datatypes are currently supported"); + TORCH_INTERNAL_ASSERT( + grad_out_.scalar_type() == query_.scalar_type() && + grad_out_.scalar_type() == key_.scalar_type() && + grad_out_.scalar_type() == value_.scalar_type() && + grad_out_.scalar_type() == out_.scalar_type(), + "scaled_dot_product_attention_backward_xpu: Expected grad_out, q, k, v and out to have the same data type, but got ", + " grad_out: ", + grad_out_.scalar_type(), + ", q: ", + query_.scalar_type(), + ", k: ", + key_.scalar_type(), + ", v: ", + value_.scalar_type(), + ", out: ", + out_.scalar_type()); + TORCH_INTERNAL_ASSERT( + logsumexp_.defined() && logsumexp_.scalar_type() == at::kFloat, + "scaled_dot_product_attention_backward_xpu: Expected logsumexp to be defined and have FP32 data type"); + const dims scalar_shape = {1}; + + at::Tensor reshaped_grad_out = grad_out_; + at::Tensor reshaped_query = query_; + at::Tensor reshaped_key = key_; + at::Tensor reshaped_value = value_; + at::Tensor reshaped_out = out_; + at::Tensor reshaped_logsumexp = logsumexp_.unsqueeze(-1); + at::Tensor reshaped_attn_mask = attn_mask_.value_or(at::Tensor()); + at::Tensor reshaped_grad_query = grad_query_; + at::Tensor 
reshaped_grad_key = grad_key_; + at::Tensor reshaped_grad_value = grad_value_; + + // handle broadcasted input tensors for OneDNN + if (at::native::onednn::is_broadcast(reshaped_grad_out)) { + at::native::onednn::undo_broadcast(reshaped_grad_out); + } + if (at::native::onednn::is_broadcast(reshaped_query)) { + at::native::onednn::undo_broadcast(reshaped_query); + } + if (at::native::onednn::is_broadcast(reshaped_key)) { + at::native::onednn::undo_broadcast(reshaped_key); + } + if (at::native::onednn::is_broadcast(reshaped_value)) { + at::native::onednn::undo_broadcast(reshaped_value); + } + if (attn_mask_.has_value() && + at::native::onednn::is_broadcast(reshaped_attn_mask)) { + at::native::onednn::undo_broadcast(reshaped_attn_mask); + } + + // TODO: Support GQA in backward pass once OneDNN supports it. + +#define LOGIC_TENSOR_DESC(name, dtype) \ + name = { \ + static_cast(TensorID::name), \ + dtype, \ + reshaped_##name.sizes().vec(), \ + reshaped_##name.strides().vec()} + + LOGIC_TENSOR_DESC(grad_out, dtype); + LOGIC_TENSOR_DESC(query, dtype); + LOGIC_TENSOR_DESC(key, dtype); + LOGIC_TENSOR_DESC(value, dtype); + LOGIC_TENSOR_DESC(out, dtype); + LOGIC_TENSOR_DESC(logsumexp, sdpa_intermediate_dtype); + scale = { + static_cast(TensorID::scale), + to_logical_tensor_data_type(at::toOpMathType(query_.scalar_type())), + scalar_shape, + logical_tensor::layout_type::strided, + logical_tensor::property_type::constant}; + if (is_causal) { + neg_inf = { + static_cast(TensorID::neg_inf), + to_logical_tensor_data_type(at::toOpMathType(query_.scalar_type())), + scalar_shape, + logical_tensor::layout_type::strided, + logical_tensor::property_type::constant}; + } + if (attn_mask_.has_value()) { + const data_type mask_dtype = + to_logical_tensor_data_type(attn_mask_->scalar_type()); + TORCH_INTERNAL_ASSERT( + (mask_dtype != data_type::undef), + "Only FP16/BF16/FP32 datatypes are currently supported for attn_mask"); + LOGIC_TENSOR_DESC(attn_mask, mask_dtype); + } + LOGIC_TENSOR_DESC(grad_query, dtype); + LOGIC_TENSOR_DESC(grad_key, dtype); + LOGIC_TENSOR_DESC(grad_value, dtype); +#undef LOGIC_TENSOR_DESC + } + std::vector get_input() const { + std::vector input = { + grad_out, query, key, value, out, logsumexp, scale}; + if (neg_inf.has_value()) { + input.push_back(neg_inf.value()); + } + if (attn_mask.has_value()) { + input.push_back(attn_mask.value()); + } + return input; + } + std::vector get_output() const { + std::vector output = {grad_query, grad_key, grad_value}; + return output; + } +}; + +partition create_sdpa_backward_graph_partition( + bool is_causal, + data_type dtype, + const SDPABackwardLogicalParams& params) { + // graph building and partitioning + size_t lt_id = static_cast(SDPABackwardLogicalParams::TensorID::end); + size_t op_id = 0; + + // OneDNN graph has optimized implementation for `f16` or `bf16` SDPA with + // `f32` intermediate data type on Intel Graphics Products with Intel(R) Xe + // Matrix Extensions (Intel(R) XMX) support, which means the + // Q/K/V tensors have bf16 or f16 data type while the output of the first + // MatMul, Scale, Mask, and the input of SoftMax are in f32 data type. 
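+  //
+  // Rough sketch of the math this graph encodes (the notation here is ours,
+  // not oneDNN's): with S = scale * (Q @ K^T) plus the additive/causal mask,
+  // P = softmax(S) = exp(S - logsumexp) recomputed from the saved logsumexp,
+  // and O = P @ V, the gradients built below are
+  //   dV = P^T @ dO
+  //   dP = dO @ V^T
+  //   dS = softmax_backward(dP, P)
+  //   dQ = scale * (dS @ K),  dK = scale * (dS^T @ Q)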
+ logical_tensor matmul_qk_out{lt_id++, sdpa_intermediate_dtype}; + op matmul_qk{ + op_id++, + op::kind::MatMul, + {params.query, params.key}, + {matmul_qk_out}, + "matmul_qk"}; + matmul_qk.set_attr(op::attr::transpose_b, true); + + logical_tensor scaled_qk_out{lt_id++, sdpa_intermediate_dtype}; + op scale_mul{ + op_id++, + op::kind::Multiply, + {matmul_qk_out, params.scale}, + {scaled_qk_out}, + "scale_mul"}; + + std::optional masked_qk_out; + + // For optional additive mask + std::optional mask_add; + + // For optional implicite causal mask + std::optional mask_gen_idx_row; + std::optional mask_row_idx; + std::optional mask_gen_idx_col; + std::optional mask_col_idx; + std::optional mask_gt; + std::optional mask_gt_out; + std::optional mask_select; + + if (params.attn_mask.has_value()) { + TORCH_INTERNAL_ASSERT( + !is_causal, "Additive mask cannot use with is_causal."); + masked_qk_out = {lt_id++, sdpa_intermediate_dtype}; + mask_add = { + op_id++, + op::kind::Add, + {scaled_qk_out, params.attn_mask.value()}, + {masked_qk_out.value()}, + "mask_add"}; + } else if (is_causal) { + mask_row_idx = {lt_id++, data_type::s32}; + mask_gen_idx_row = { + op_id++, + op::kind::GenIndex, + {scaled_qk_out}, + {mask_row_idx.value()}, + "mask_gen_idx_row"}; + mask_gen_idx_row->set_attr(op::attr::axis, -2); + + mask_col_idx = {lt_id++, data_type::s32}; + mask_gen_idx_col = { + op_id++, + op::kind::GenIndex, + {scaled_qk_out}, + {mask_col_idx.value()}, + "mask_gen_idx_col"}; + mask_gen_idx_col->set_attr(op::attr::axis, -1); + + mask_gt_out = {lt_id++, data_type::boolean}; + mask_gt = { + op_id++, + op::kind::GreaterEqual, + {mask_row_idx.value(), mask_col_idx.value()}, + {mask_gt_out.value()}, + "mask_gt"}; + + masked_qk_out = {lt_id++, sdpa_intermediate_dtype}; + mask_select = { + op_id++, + op::kind::Select, + {mask_gt_out.value(), scaled_qk_out, params.neg_inf.value()}, + {masked_qk_out.value()}, + "mask_select"}; + } + + // attention_probs = softmax(masked_score) = exp(masked_score - logsumexp) + logical_tensor sub_out{lt_id++, sdpa_intermediate_dtype}; + op subtract{ + op_id++, + op::kind::Subtract, + {masked_qk_out.value_or(scaled_qk_out), params.logsumexp}, + {sub_out}, + "subtract"}; + logical_tensor prob{lt_id++, sdpa_intermediate_dtype}; + op exp{op_id++, op::kind::Exp, {sub_out}, {prob}, "exp"}; + + // The following matmul doesn't support different input dtypes, insert a + // typecast + logical_tensor prob_casted = prob; + op typecast = op(op_id++, op::kind::TypeCast, "typecast"); + if (dtype != sdpa_intermediate_dtype) { + prob_casted = logical_tensor(lt_id++, dtype); + typecast.add_inputs({prob}); + typecast.add_outputs({prob_casted}); + } + + // grad_value = prob^T * grad_out + // TODO: handle GQA headnum because (batch_size, num_head_kv, seq_len_kv, + // head_dim_v) != (batch_size, num_head_q, seqlen_kv, seq_len_q) * + // (batch_size, num_head_q, seqlen_q, head_dim_v) + op matmul_grad_value{ + op_id++, + op::kind::MatMul, + {prob_casted, params.grad_out}, + {params.grad_value}, + "matmul_grad_value"}; + matmul_grad_value.set_attr(op::attr::transpose_a, true); + + // grad_prop = grad_out * value^T + // TODO: handle GQA headnum because (batch_size, num_head_q, seq_len_q, + // seq_len_kv) != (batch_size, num_head_q, seq_len_q, head_dim_v) * + // (batch_size, num_head_kv, head_dim_v, seq_len_kv) + logical_tensor grad_prop{lt_id++, sdpa_intermediate_dtype}; + op matmul_grad_prop{ + op_id++, + op::kind::MatMul, + {params.grad_out, params.value}, + {grad_prop}, + "matmul_grad_prop"}; + 
matmul_grad_prop.set_attr(op::attr::transpose_b, true); + + // grad_masked_score = softmaxbackward(grad_prop) + logical_tensor grad_masked_score{lt_id++, sdpa_intermediate_dtype}; + op softmax_backward{ + op_id++, + op::kind::SoftMaxBackward, + {grad_prop, prob}, + {grad_masked_score}, + "softmax_backward"}; + softmax_backward.set_attr(op::attr::axis, -1); + + // TODO: add output tensor grad_attn_mask = grad_masked_score once OneDNN + // supports output grad_attn_mask. + + // grad_scaled_score = grad_masked_score * scale + logical_tensor grad_scaled_score{lt_id++, sdpa_intermediate_dtype}; + op grad_scale_mul{ + op_id++, + op::kind::Multiply, + {grad_masked_score, params.scale}, + {grad_scaled_score}, + "grad_scale_mul"}; + + // The following matmul doesn't support different input dtypes, insert a + // typecast + logical_tensor grad_scaled_score_cast = grad_scaled_score; + op typecast2 = op(op_id++, op::kind::TypeCast, "typecast2"); + if (dtype != sdpa_intermediate_dtype) { + grad_scaled_score_cast = logical_tensor(lt_id++, dtype); + typecast2.add_inputs({grad_scaled_score}); + typecast2.add_outputs({grad_scaled_score_cast}); + } + + // grad_query = grad_scaled_score_cast * key + // TODO: handle GQA headnum because (batch_size, num_head_q, seq_len_q, + // head_dim_qk) != (batch_size, num_head_q, seq_len_q, seq_len_kv) * + // (batch_size, num_head_kv, seq_len_kv, head_dim_qk) + op matmul_grad_query{ + op_id++, + op::kind::MatMul, + {grad_scaled_score_cast, params.key}, + {params.grad_query}, + "matmul_grad_query"}; + + // grad_key = grad_scaled_score_cast^T * query + op matmul_grad_key{ + op_id++, + op::kind::MatMul, + {grad_scaled_score_cast, params.query}, + {params.grad_key}, + "matmul_grad_key"}; + matmul_grad_key.set_attr(op::attr::transpose_a, true); + + constexpr auto ekind = dnnl::engine::kind::gpu; + dnnl::graph::graph g(ekind); + g.add_op(matmul_qk); + g.add_op(scale_mul); + if (mask_add.has_value()) { + g.add_op(mask_add.value()); + } + if (is_causal) { + g.add_op(mask_gen_idx_row.value()); + g.add_op(mask_gen_idx_col.value()); + g.add_op(mask_gt.value()); + g.add_op(mask_select.value()); + } + g.add_op(subtract); + g.add_op(exp); + g.add_op(matmul_grad_value); + g.add_op(matmul_grad_prop); + g.add_op(softmax_backward); + g.add_op(grad_scale_mul); + g.add_op(matmul_grad_query); + g.add_op(matmul_grad_key); + if (dtype != sdpa_intermediate_dtype) { + g.add_op(typecast); + g.add_op(typecast2); + } + g.finalize(); + auto partitions = g.get_partitions(); + TORCH_INTERNAL_ASSERT( + (partitions.size() == 1) && partitions[0].is_supported(), + "oneDNN doesn't support this fusion pattern. If you'd like its support, please submit a issue."); + return partitions[0]; +} + +partition& find_or_create_backward_graph_partition( + bool is_causal, + const SDPABackwardLogicalParams& params) { + thread_local PartitionCache cache; + const data_type dtype = params.query.get_data_type(); + + // cache key creation + // patternID is determined on the basis of the arguments provided + std::bitset<32> patternID; + if (dtype == data_type::f32) { + patternID.set(static_cast(PartitionCache::BitType::Float32), 1); + } + if (dtype == data_type::bf16) { + patternID.set(static_cast(PartitionCache::BitType::Bfloat16), 1); + } + // sdpa backward pattern + patternID.set( + static_cast(PartitionCache::BitType::SdpaBwdPattern), 1); + + // Refer to comments in Utils.h. 
The first 8 bits are reserved + int pos = 8; + // attn_mask + patternID.set(pos++, params.attn_mask.has_value()); + patternID.set(pos++, is_causal); + + auto partition_ = cache.find_partition(patternID); + if (!partition_.has_value()) { + // partition cache no hit + // graph building and partitioning + partition sdpa_backward_partition = + create_sdpa_backward_graph_partition(is_causal, dtype, params); + partition_ = + cache.insert_partition_cache(patternID, sdpa_backward_partition); + } + return *partition_; +} +} // namespace sdpa_backward } // namespace namespace at::native::onednn { -void gpu_float_sdpa( +void sdpa( int batch_size, int seq_len_q, int seq_len_kv, @@ -355,7 +796,9 @@ void gpu_float_sdpa( std::optional attn_mask, bool is_causal, float softmax_scale, - const Tensor& output) { + const Tensor& attention, + bool compute_logsumexp, + const Tensor& logsumexp) { auto& eng = GpuEngineManager::Instance().get_engine(); auto& strm = GpuStreamManager::Instance().get_stream(); @@ -370,8 +813,8 @@ void gpu_float_sdpa( }; // OneDNN doesn't support fp32 ukernel for implicit causal mask, - // and the reference implementation is worse than aten math + explict causal - // mask. Fall back to explict causal mask until OneDNN v3.9 which has fp32 + // and the reference implementation is worse than aten math + explicit causal + // mask. Fall back to explicit causal mask until OneDNN v3.9 which has fp32 // ukernel for implicit causal mask. if (is_causal && query.dtype() == at::kFloat) { attn_mask = get_tril_mask(); @@ -381,32 +824,27 @@ void gpu_float_sdpa( std::vector l_inputs, l_outputs; std::optional compiled_partition; - auto get_compiled_partition = [&]() { - const SDPALogicalParams logical_params( - query, - key, - value, - attn_mask, - output, - batch_size, - seq_len_q, - seq_len_kv, - num_head_q, - num_head_kv, - head_dim_qk, - head_dim_v, - is_causal); - auto& partition_ = - find_or_create_graph_partition(is_causal, logical_params); - auto i = logical_params.get_input(); - auto o = logical_params.get_output(); - auto compiled_partition = partition_.compile(i, o, eng); - l_inputs = std::move(i); - l_outputs = std::move(o); - return compiled_partition; - }; - - compiled_partition = get_compiled_partition(); + const sdpa_forward::SDPALogicalParams logical_params( + query, + key, + value, + attn_mask, + attention, + logsumexp, + batch_size, + seq_len_q, + seq_len_kv, + num_head_q, + num_head_kv, + head_dim_qk, + head_dim_v, + is_causal, + compute_logsumexp); + auto& partition = sdpa_forward::find_or_create_graph_partition( + is_causal, compute_logsumexp, logical_params); + l_inputs = std::move(logical_params.get_input()); + l_outputs = std::move(logical_params.get_output()); + compiled_partition = partition.compile(l_inputs, l_outputs, eng); Tensor softmax_scale1 = at::full( {}, @@ -416,26 +854,147 @@ void gpu_float_sdpa( if (is_causal) { neg_inf = at::full( {}, - -INFINITY, + -std::numeric_limits::infinity(), query.options().dtype(at::toOpMathType(query.scalar_type()))); } std::vector outputs = { - {l_outputs[0], eng, output.data_ptr()}, + {l_outputs[0], eng, attention.data_ptr()}, }; + if (compute_logsumexp) { + outputs.emplace_back(l_outputs[1], eng, logsumexp.data_ptr()); + } + size_t i = 0; std::vector inputs; inputs.reserve(l_inputs.size()); - inputs.emplace_back(l_inputs[i++], eng, query.data_ptr()); - inputs.emplace_back(l_inputs[i++], eng, key.data_ptr()); - inputs.emplace_back(l_inputs[i++], eng, softmax_scale1.data_ptr()); + +#define ADD_INPUT(variable) \ + 
inputs.emplace_back(l_inputs[i++], eng, variable.data_ptr()) + + ADD_INPUT(query); + ADD_INPUT(key); + ADD_INPUT(softmax_scale1); if (neg_inf.has_value()) { - inputs.emplace_back(l_inputs[i++], eng, neg_inf->data_ptr()); + ADD_INPUT((*neg_inf)); } if (attn_mask.has_value()) { - inputs.emplace_back(l_inputs[i++], eng, attn_mask->data_ptr()); + ADD_INPUT((*attn_mask)); } - inputs.emplace_back(l_inputs[i++], eng, value.data_ptr()); + ADD_INPUT(value); +#undef ADD_INPUT + + compiled_partition->execute(strm, inputs, outputs); +} + +void sdpa_backward( + int batch_size, + int num_head_q, + int num_head_kv, + int seq_len_q, + int seq_len_kv, + int head_dim_qk, + int head_dim_v, + const Tensor& grad_out, + const Tensor& query, + const Tensor& key, + const Tensor& value, + const Tensor& out, + const Tensor& logsumexp, + std::optional attn_mask, + bool is_causal, + double scale, + Tensor& grad_query, + Tensor& grad_key, + Tensor& grad_value) { + auto& eng = GpuEngineManager::Instance().get_engine(); + auto& strm = GpuStreamManager::Instance().get_stream(); + + const auto get_tril_mask = [&]() { + auto opts = query.options(); + auto bool_tril = + at::ones_symint({seq_len_q, seq_len_kv}, opts.dtype(at::kBool)).tril(); + return at::where( + bool_tril, + 0.f, + at::scalar_tensor(-std::numeric_limits::infinity(), opts)); + }; + + // OneDNN doesn't support fp32 ukernel for implicit causal mask, + // and the reference implementation is worse than aten math + explicit causal + // mask. Fall back to explicit causal mask until OneDNN v3.9 which has fp32 + // ukernel for implicit causal mask. + if (is_causal && query.dtype() == at::kFloat) { + attn_mask = get_tril_mask(); + is_causal = false; + } + + std::vector l_inputs, l_outputs; + std::optional compiled_partition; + + const sdpa_backward::SDPABackwardLogicalParams logical_params( + grad_out, + query, + key, + value, + out, + logsumexp, + attn_mask, + grad_query, + grad_key, + grad_value, + batch_size, + num_head_q, + num_head_kv, + seq_len_q, + seq_len_kv, + head_dim_qk, + head_dim_v, + is_causal); + auto& partition = sdpa_backward::find_or_create_backward_graph_partition( + is_causal, logical_params); + l_inputs = std::move(logical_params.get_input()); + l_outputs = std::move(logical_params.get_output()); + compiled_partition = partition.compile(l_inputs, l_outputs, eng); + + Tensor softmax_scale = at::full( + {}, scale, query.options().dtype(at::toOpMathType(query.scalar_type()))); + std::optional neg_inf; + if (is_causal) { + neg_inf = at::full( + {}, + -std::numeric_limits::infinity(), + query.options().dtype(at::toOpMathType(query.scalar_type()))); + } + + std::vector outputs = { + {l_outputs[0], eng, grad_query.data_ptr()}, + {l_outputs[1], eng, grad_key.data_ptr()}, + {l_outputs[2], eng, grad_value.data_ptr()}, + }; + + size_t i = 0; + std::vector inputs; + inputs.reserve(l_inputs.size()); + +#define ADD_INPUT(variable) \ + inputs.emplace_back(l_inputs[i++], eng, variable.data_ptr()) + + ADD_INPUT(grad_out); + ADD_INPUT(query); + ADD_INPUT(key); + ADD_INPUT(value); + ADD_INPUT(out); + ADD_INPUT(logsumexp); + ADD_INPUT(softmax_scale); + if (neg_inf.has_value()) { + ADD_INPUT((*neg_inf)); + } + if (attn_mask.has_value()) { + ADD_INPUT((*attn_mask)); + } +#undef ADD_INPUT + compiled_partition->execute(strm, inputs, outputs); } } // namespace at::native::onednn diff --git a/aten/src/ATen/native/mkldnn/xpu/detail/Utils.h b/aten/src/ATen/native/mkldnn/xpu/detail/Utils.h index ac8645d3e4a5..52f89bc1395d 100644 --- 
a/aten/src/ATen/native/mkldnn/xpu/detail/Utils.h +++ b/aten/src/ATen/native/mkldnn/xpu/detail/Utils.h @@ -110,11 +110,21 @@ struct PartitionCache { // bit 1: is uint8 // bit 2: fp16(0) / bf16(1) // bit 3: is fp32 - // bit 4: is sdp pattern - // bit 5-7: N/A + // bit 4: is sdpa pattern + // bit 5: is sdpa backward pattern + // bit 6-7: reserved for future use // The rest of the bits depend upon the arguments provided // However, down the line, we might have different bitsets for different // patterns + enum class BitType : uint8_t { + Int8 = 0, + Uint8 = 1, + Bfloat16 = 2, + Float32 = 3, + SdpaPattern = 4, + SdpaBwdPattern = 5 + }; + dnnl::graph::partition& insert_partition_cache( std::bitset<32>& patternID, dnnl::graph::partition& p) { diff --git a/aten/src/ATen/native/mkldnn/xpu/detail/oneDNN.h b/aten/src/ATen/native/mkldnn/xpu/detail/oneDNN.h index e73cb73e8b1e..6b2bf01e6d73 100644 --- a/aten/src/ATen/native/mkldnn/xpu/detail/oneDNN.h +++ b/aten/src/ATen/native/mkldnn/xpu/detail/oneDNN.h @@ -164,7 +164,7 @@ void quantized_matmul( std::string_view unary_post_op_algorithm, bool m2_trnas); -void gpu_float_sdpa( +void sdpa( int batch_size, int seq_len_q, int seq_len_kv, @@ -178,5 +178,28 @@ void gpu_float_sdpa( std::optional attn_mask, bool is_causal, float softmax_scale, - const Tensor& output); + const Tensor& attention, + bool compute_logsumexp, + const Tensor& logsumexp); + +void sdpa_backward( + int batch_size, + int num_head_q, + int num_head_kv, + int seq_len_q, + int seq_len_kv, + int head_dim_qk, + int head_dim_v, + const Tensor& grad_out, + const Tensor& query, + const Tensor& key, + const Tensor& value, + const Tensor& out, + const Tensor& logsumexp, + std::optional attn_mask, + bool is_causal, + double scale, + Tensor& grad_query, + Tensor& grad_key, + Tensor& grad_value); } // namespace at::native::onednn diff --git a/aten/src/ATen/native/mkldnn/xpu/qconv.cpp b/aten/src/ATen/native/mkldnn/xpu/qconv.cpp index 1c6e2a6c89da..c014313a5b35 100644 --- a/aten/src/ATen/native/mkldnn/xpu/qconv.cpp +++ b/aten/src/ATen/native/mkldnn/xpu/qconv.cpp @@ -1,5 +1,7 @@ #include #include +#include + #include #include #include @@ -7,7 +9,7 @@ using namespace at::native::onednn; namespace at::native::xpu { -static inline c10::ScalarType qconv_decide_out_dtype( +inline c10::ScalarType QConvoneDNNXPU::qconv_decide_out_dtype( const at::Tensor& act, const std::optional output_dtype) { bool fp32_output = output_dtype.has_value() && (output_dtype == c10::kFloat); @@ -19,7 +21,7 @@ static inline c10::ScalarType qconv_decide_out_dtype( return dst_dtype; } -static at::Tensor qconv_prepack_xpu( +at::Tensor QConvoneDNNXPU::qconv_prepack_xpu( at::Tensor weight, at::Tensor weight_scales, double input_scale, @@ -33,222 +35,265 @@ static at::Tensor qconv_prepack_xpu( return weight; } -class QConvoneDNNXPU final { - public: - static at::Tensor run_pointwise( - at::Tensor act, - double act_scale, - int64_t act_zero_point, - at::Tensor weight, - at::Tensor weight_scales, - at::Tensor weight_zero_points, - std::optional bias, - torch::List stride, - torch::List padding, - torch::List dilation, - int64_t groups, - double inv_output_scale, - int64_t output_zero_point, - std::optional output_dtype, - std::string_view attr, - torch::List> scalars, - std::optional algorithm) { - if (act.dim() == 3 || act.dim() == 5) { - TORCH_CHECK( - attr == "none", - "quantized pointwise conv", - act.dim() - 2, - "d doesn't support unary_post_op fusion. 
Got unary_post_op:", - attr, - "."); - } else { - TORCH_CHECK( - attr == "none" || attr == "relu" || attr == "hardtanh" || - attr == "hardswish" || attr == "swish", - "We support quantized convolution without any post-ops or combinations for Quantized Conv + ReLU, Hardtanh, GELU, Swish, and Hardswish are supported. However, encountered unsupported post operation:", - attr, - "."); - } +at::Tensor QConvoneDNNXPU::run_pointwise( + at::Tensor act, + double act_scale, + int64_t act_zero_point, + at::Tensor weight, + at::Tensor weight_scales, + at::Tensor weight_zero_points, + std::optional bias, + torch::List stride, + torch::List padding, + torch::List dilation, + int64_t groups, + double inv_output_scale, + int64_t output_zero_point, + std::optional output_dtype, + std::string_view attr, + torch::List> scalars, + std::optional algorithm) { + if (act.dim() == 3 || act.dim() == 5) { + TORCH_CHECK( + attr == "none", + "quantized pointwise conv", + act.dim() - 2, + "d doesn't support unary_post_op fusion. Got unary_post_op:", + attr, + "."); + } else { + TORCH_CHECK( + attr == "none" || attr == "relu" || attr == "hardtanh" || + attr == "hardswish" || attr == "swish", + "We support quantized convolution without any post-ops or combinations for Quantized Conv + ReLU, Hardtanh, GELU, Swish, and Hardswish are supported. However, encountered unsupported post operation:", + attr, + "."); + } - bool is_channels_last_suggested = use_channels_last_for_conv(act, weight); - auto mfmt = is_channels_last_suggested - ? get_cl_tag_by_ndim(act.ndimension()) - : at::MemoryFormat::Contiguous; - Tensor input_ = act.contiguous(mfmt); - Tensor weight_ = weight.contiguous(mfmt); + bool is_channels_last_suggested = use_channels_last_for_conv(act, weight); + auto mfmt = is_channels_last_suggested ? 
get_cl_tag_by_ndim(act.ndimension()) + : at::MemoryFormat::Contiguous; + Tensor input_ = act.contiguous(mfmt); + Tensor weight_ = weight.contiguous(mfmt); - auto dst_tz = conv_dst_size( - input_.ndimension(), - input_.sizes(), - weight_.sizes(), - padding.vec(), - padding.vec(), - stride.vec(), - dilation.vec()); + auto dst_tz = conv_dst_size( + input_.ndimension(), + input_.sizes(), + weight_.sizes(), + padding.vec(), + padding.vec(), + stride.vec(), + dilation.vec()); - auto dst_dtype = qconv_decide_out_dtype(act, output_dtype); - Tensor output = - at::empty(dst_tz, act.options().dtype(dst_dtype).memory_format(mfmt)); + auto dst_dtype = qconv_decide_out_dtype(act, output_dtype); + Tensor output = + at::empty(dst_tz, act.options().dtype(dst_dtype).memory_format(mfmt)); - return quantized_convolution( - act, - act_scale, - act_zero_point, - weight, - weight_scales, - weight_zero_points, - bias, - stride, - padding, - dilation, - /*transposed*/ false, - groups, - output, - inv_output_scale, - output_zero_point, - /*accum*/ std::nullopt, - /*accum_scale*/ 0.0, - /*accum_zero_point*/ 0, - /*output_dtype*/ output_dtype, - /*binary_attr*/ std::nullopt, - /*binary_alpha*/ std::nullopt, - /*unary_attr*/ attr, - /*unary_scalars*/ scalars, - /*unary_algorithm*/ algorithm); - } + return quantized_convolution( + act, + act_scale, + act_zero_point, + weight, + weight_scales, + weight_zero_points, + bias, + stride, + padding, + dilation, + /*transposed*/ false, + groups, + output, + inv_output_scale, + output_zero_point, + /*accum*/ std::nullopt, + /*accum_scale*/ 0.0, + /*accum_zero_point*/ 0, + /*output_dtype*/ output_dtype, + /*binary_attr*/ std::nullopt, + /*binary_alpha*/ std::nullopt, + /*unary_attr*/ attr, + /*unary_scalars*/ scalars, + /*unary_algorithm*/ algorithm); +} - static at::Tensor run_pointwise_tensor( - at::Tensor act, - at::Tensor act_scale, - at::Tensor act_zero_point, - at::Tensor weight, - at::Tensor weight_scales, - at::Tensor weight_zero_points, - std::optional bias, - torch::List stride, - torch::List padding, - torch::List dilation, - int64_t groups, - double output_scale, - int64_t output_zero_point, - std::optional output_dtype, - std::string_view attr, - torch::List> scalars, - std::optional algorithm) { - return run_pointwise( - act, - act_scale.item().toDouble(), - act_zero_point.item().toLong(), - weight, - weight_scales, - weight_zero_points, - bias, - stride, - padding, - dilation, - groups, - output_scale, - output_zero_point, - output_dtype, - /*unary_attr*/ attr, - /*unary_scalars*/ scalars, - /*unary_algorithm*/ algorithm); - } +at::Tensor QConvoneDNNXPU::run_pointwise_tensor( + at::Tensor act, + at::Tensor act_scale, + at::Tensor act_zero_point, + at::Tensor weight, + at::Tensor weight_scales, + at::Tensor weight_zero_points, + std::optional bias, + torch::List stride, + torch::List padding, + torch::List dilation, + int64_t groups, + double output_scale, + int64_t output_zero_point, + std::optional output_dtype, + std::string_view attr, + torch::List> scalars, + std::optional algorithm) { + return run_pointwise( + act, + act_scale.item().toDouble(), + act_zero_point.item().toLong(), + weight, + weight_scales, + weight_zero_points, + bias, + stride, + padding, + dilation, + groups, + output_scale, + output_zero_point, + output_dtype, + /*unary_attr*/ attr, + /*unary_scalars*/ scalars, + /*unary_algorithm*/ algorithm); +} - static at::Tensor run_pointwise_binary( - at::Tensor act, - double act_scale, - int64_t act_zero_point, - at::Tensor weight, - at::Tensor 
weight_scales, - at::Tensor weight_zero_points, - at::Tensor accum, - std::optional bias, - torch::List stride, - torch::List padding, - torch::List dilation, - int64_t groups, - double output_scale, - int64_t output_zero_point, - std::optional output_dtype, - double accum_scale, - int64_t accum_zero_point, - std::string_view binary_attr, - std::optional alpha, - std::optional unary_attr, - torch::List> unary_scalars, - std::optional unary_algorithm) { - TORCH_CHECK( - act.dim() == 4 && binary_attr == "sum" && - (!unary_attr.has_value() || - (unary_attr.has_value() && - (unary_attr.value() == "none" || unary_attr.value() == "relu"))), - "post_op sum or post_op sum_relu is supported for quantized pointwise conv2d. Got binary_post_op: ", - binary_attr, - " unary_post_op: ", - unary_attr.has_value() ? unary_attr.value() : "none", - ".") +at::Tensor QConvoneDNNXPU::run_pointwise_binary( + at::Tensor act, + double act_scale, + int64_t act_zero_point, + at::Tensor weight, + at::Tensor weight_scales, + at::Tensor weight_zero_points, + at::Tensor accum, + std::optional bias, + torch::List stride, + torch::List padding, + torch::List dilation, + int64_t groups, + double output_scale, + int64_t output_zero_point, + std::optional output_dtype, + double accum_scale, + int64_t accum_zero_point, + std::string_view binary_attr, + std::optional alpha, + std::optional unary_attr, + torch::List> unary_scalars, + std::optional unary_algorithm) { + TORCH_CHECK( + act.dim() == 4 && binary_attr == "sum" && + (!unary_attr.has_value() || + (unary_attr.has_value() && + (unary_attr.value() == "none" || unary_attr.value() == "relu"))), + "post_op sum or post_op sum_relu is supported for quantized pointwise conv2d. Got binary_post_op: ", + binary_attr, + " unary_post_op: ", + unary_attr.has_value() ? unary_attr.value() : "none", + ".") - bool is_channels_last_suggested = use_channels_last_for_conv(act, weight); - auto mfmt = is_channels_last_suggested - ? get_cl_tag_by_ndim(act.ndimension()) - : at::MemoryFormat::Contiguous; - Tensor input_ = act.contiguous(mfmt); - Tensor weight_ = weight.contiguous(mfmt); + bool is_channels_last_suggested = use_channels_last_for_conv(act, weight); + auto mfmt = is_channels_last_suggested ? get_cl_tag_by_ndim(act.ndimension()) + : at::MemoryFormat::Contiguous; + Tensor input_ = act.contiguous(mfmt); + Tensor weight_ = weight.contiguous(mfmt); - auto dst_tz = conv_dst_size( - input_.ndimension(), - input_.sizes(), - weight_.sizes(), - padding.vec(), - padding.vec(), - stride.vec(), - dilation.vec()); + auto dst_tz = conv_dst_size( + input_.ndimension(), + input_.sizes(), + weight_.sizes(), + padding.vec(), + padding.vec(), + stride.vec(), + dilation.vec()); - auto dst_dtype = qconv_decide_out_dtype(act, output_dtype); - bool has_accum_postop_sum = binary_attr == "sum"; - Tensor output = has_accum_postop_sum - ? accum - : at::empty(dst_tz, act.options().dtype(dst_dtype).memory_format(mfmt)); + auto dst_dtype = qconv_decide_out_dtype(act, output_dtype); + bool has_accum_postop_sum = binary_attr == "sum"; + Tensor output = has_accum_postop_sum + ? 
accum + : at::empty(dst_tz, act.options().dtype(dst_dtype).memory_format(mfmt)); - output = quantized_convolution( - act, - act_scale, - act_zero_point, - weight, - weight_scales, - weight_zero_points, - bias, - stride, - padding, - dilation, - /*transposed*/ false, - groups, - output, - output_scale, - output_zero_point, - /*accum*/ accum, - /*accum_scale*/ accum_scale, - /*accum_zero_point*/ accum_zero_point, - /*output_dtype*/ output_dtype, - /*binary_attr*/ binary_attr, - /*binary_alpha*/ alpha, - /*unary_attr*/ unary_attr, - /*unary_scalars*/ unary_scalars, - /*unary_algorithm*/ unary_algorithm); + output = quantized_convolution( + act, + act_scale, + act_zero_point, + weight, + weight_scales, + weight_zero_points, + bias, + stride, + padding, + dilation, + /*transposed*/ false, + groups, + output, + output_scale, + output_zero_point, + /*accum*/ accum, + /*accum_scale*/ accum_scale, + /*accum_zero_point*/ accum_zero_point, + /*output_dtype*/ output_dtype, + /*binary_attr*/ binary_attr, + /*binary_alpha*/ alpha, + /*unary_attr*/ unary_attr, + /*unary_scalars*/ unary_scalars, + /*unary_algorithm*/ unary_algorithm); - if (!has_accum_postop_sum) { - return output; - } else { - return accum; - } + if (!has_accum_postop_sum) { + return output; + } else { + return accum; } -}; +} + +at::Tensor QConvoneDNNXPU::run_pointwise_binary_tensor( + at::Tensor act, // contains quantized values but not QTensor + at::Tensor act_scale, + at::Tensor act_zero_point, + at::Tensor weight, // contains quantized values but not QTensor + at::Tensor weight_scales, + at::Tensor weight_zero_points, + at::Tensor accum, // contains quantized values but not QTensor + std::optional bias, + torch::List stride, + torch::List padding, + torch::List dilation, + int64_t groups, + double output_scale, + int64_t output_zero_point, + std::optional output_dtype, + double accum_scale, + int64_t accum_zero_point, + std::string_view binary_attr, + std::optional alpha, + std::optional unary_attr, + torch::List> unary_scalars, + std::optional unary_algorithm) { + return run_pointwise_binary( + act, + act_scale.item().toDouble(), + act_zero_point.item().toLong(), + weight, + weight_scales, + weight_zero_points, + accum, + bias, + stride, + padding, + dilation, + groups, + output_scale, + output_zero_point, + output_dtype, + accum_scale, + accum_zero_point, + binary_attr, + alpha, + unary_attr, + unary_scalars, + unary_algorithm); +} TORCH_LIBRARY_IMPL(onednn, XPU, m) { m.impl( TORCH_SELECTIVE_NAME("onednn::qconv_prepack"), - TORCH_FN(xpu::qconv_prepack_xpu)); + TORCH_FN(QConvoneDNNXPU::qconv_prepack_xpu)); m.impl( TORCH_SELECTIVE_NAME("onednn::qconv1d_pointwise"), QConvoneDNNXPU::run_pointwise); @@ -267,6 +312,9 @@ TORCH_LIBRARY_IMPL(onednn, XPU, m) { m.impl( TORCH_SELECTIVE_NAME("onednn::qconv_pointwise.tensor"), QConvoneDNNXPU::run_pointwise_tensor); + m.impl( + TORCH_SELECTIVE_NAME("onednn::qconv2d_pointwise.binary_tensor"), + QConvoneDNNXPU::run_pointwise_binary_tensor); } } // namespace at::native::xpu diff --git a/aten/src/ATen/native/mkldnn/xpu/qconv.h b/aten/src/ATen/native/mkldnn/xpu/qconv.h new file mode 100644 index 000000000000..e9ddd4fa2969 --- /dev/null +++ b/aten/src/ATen/native/mkldnn/xpu/qconv.h @@ -0,0 +1,111 @@ +#pragma once + +#include +#include + +namespace at::native::xpu { +class QConvoneDNNXPU final { + public: + C10_API static at::Tensor run_pointwise( + at::Tensor act, + double act_scale, + int64_t act_zero_point, + at::Tensor weight, + at::Tensor weight_scales, + at::Tensor weight_zero_points, + 
std::optional bias, + torch::List stride, + torch::List padding, + torch::List dilation, + int64_t groups, + double inv_output_scale, + int64_t output_zero_point, + std::optional output_dtype, + std::string_view attr, + torch::List> scalars, + std::optional algorithm); + + C10_API static at::Tensor run_pointwise_tensor( + at::Tensor act, + at::Tensor act_scale, + at::Tensor act_zero_point, + at::Tensor weight, + at::Tensor weight_scales, + at::Tensor weight_zero_points, + std::optional bias, + torch::List stride, + torch::List padding, + torch::List dilation, + int64_t groups, + double output_scale, + int64_t output_zero_point, + std::optional output_dtype, + std::string_view attr, + torch::List> scalars, + std::optional algorithm); + + C10_API static at::Tensor run_pointwise_binary( + at::Tensor act, + double act_scale, + int64_t act_zero_point, + at::Tensor weight, + at::Tensor weight_scales, + at::Tensor weight_zero_points, + at::Tensor accum, + std::optional bias, + torch::List stride, + torch::List padding, + torch::List dilation, + int64_t groups, + double output_scale, + int64_t output_zero_point, + std::optional output_dtype, + double accum_scale, + int64_t accum_zero_point, + std::string_view binary_attr, + std::optional alpha, + std::optional unary_attr, + torch::List> unary_scalars, + std::optional unary_algorithm); + + C10_API static at::Tensor run_pointwise_binary_tensor( + at::Tensor act, + at::Tensor act_scale, + at::Tensor act_zero_point, + at::Tensor weight, + at::Tensor weight_scales, + at::Tensor weight_zero_points, + at::Tensor accum, + std::optional bias, + torch::List stride, + torch::List padding, + torch::List dilation, + int64_t groups, + double output_scale, + int64_t output_zero_point, + std::optional output_dtype, + double accum_scale, + int64_t accum_zero_point, + std::string_view binary_attr, + std::optional alpha, + std::optional unary_attr, + torch::List> unary_scalars, + std::optional unary_algorithm); + + static inline c10::ScalarType qconv_decide_out_dtype( + const at::Tensor& act, + const std::optional output_dtype); + + static at::Tensor qconv_prepack_xpu( + at::Tensor weight, + at::Tensor weight_scales, + double input_scale, + int64_t input_zero_point, + torch::List stride, + torch::List padding, + torch::List dilation, + int64_t groups, + std::optional> input_shape); +}; + +} // namespace at::native::xpu \ No newline at end of file diff --git a/aten/src/ATen/native/mkldnn/xpu/qlinear.cpp b/aten/src/ATen/native/mkldnn/xpu/qlinear.cpp index 7e3f2f01fa1e..e9584e8289eb 100644 --- a/aten/src/ATen/native/mkldnn/xpu/qlinear.cpp +++ b/aten/src/ATen/native/mkldnn/xpu/qlinear.cpp @@ -1,13 +1,14 @@ #include #include +#include #include using namespace at::native::onednn; namespace at::native::xpu { -static inline c10::ScalarType qlinear_decide_out_dtype( +inline c10::ScalarType QLinearOnednnXPU::qlinear_decide_out_dtype( const at::Tensor& act, const std::optional output_dtype) { bool fp32_output = output_dtype.has_value() && (output_dtype == c10::kFloat); @@ -19,7 +20,7 @@ static inline c10::ScalarType qlinear_decide_out_dtype( return dst_dtype; } -static Tensor q_linear_pointwise( +Tensor QLinearOnednnXPU::q_linear_pointwise( Tensor act, double act_scale, int64_t act_zero_point, @@ -78,7 +79,7 @@ static Tensor q_linear_pointwise( return qout; } -static Tensor q_linear_pointwise_tensor( +Tensor QLinearOnednnXPU::q_linear_pointwise_tensor( Tensor act, Tensor act_scale, Tensor act_zero_point, @@ -137,7 +138,7 @@ static Tensor q_linear_pointwise_tensor( return qout; 
} -static Tensor q_linear_pointwise_binary( +Tensor QLinearOnednnXPU::q_linear_pointwise_binary( Tensor act, double act_scale, int64_t act_zero_point, @@ -208,7 +209,7 @@ static Tensor q_linear_pointwise_binary( return dim == 3 ? qout.reshape({act.size(0), -1, N}) : qout; } -static Tensor q_linear_pointwise_binary_tensor( +Tensor QLinearOnednnXPU::q_linear_pointwise_binary_tensor( Tensor act, Tensor act_scale, Tensor act_zero_point, @@ -248,7 +249,7 @@ static Tensor q_linear_pointwise_binary_tensor( unary_post_op_algorithm); } -static at::Tensor q_linear_prepack_onednn( +Tensor QLinearOnednnXPU::q_linear_prepack_onednn( at::Tensor weight, std::optional> input_shape) { at::Tensor weight_transposed = weight.transpose(0, 1); @@ -258,19 +259,19 @@ static at::Tensor q_linear_prepack_onednn( TORCH_LIBRARY_IMPL(onednn, XPU, m) { m.impl( TORCH_SELECTIVE_NAME("onednn::qlinear_pointwise"), - TORCH_FN(q_linear_pointwise)); + TORCH_FN(QLinearOnednnXPU::q_linear_pointwise)); m.impl( TORCH_SELECTIVE_NAME("onednn::qlinear_pointwise.tensor"), - TORCH_FN(q_linear_pointwise_tensor)); + TORCH_FN(QLinearOnednnXPU::q_linear_pointwise_tensor)); m.impl( TORCH_SELECTIVE_NAME("onednn::qlinear_prepack"), - TORCH_FN(q_linear_prepack_onednn)); + TORCH_FN(QLinearOnednnXPU::q_linear_prepack_onednn)); m.impl( TORCH_SELECTIVE_NAME("onednn::qlinear_pointwise.binary"), - TORCH_FN(q_linear_pointwise_binary)); + TORCH_FN(QLinearOnednnXPU::q_linear_pointwise_binary)); m.impl( TORCH_SELECTIVE_NAME("onednn::qlinear_pointwise.binary_tensor"), - TORCH_FN(q_linear_pointwise_binary_tensor)); + TORCH_FN(QLinearOnednnXPU::q_linear_pointwise_binary_tensor)); } } // namespace at::native::xpu diff --git a/aten/src/ATen/native/mkldnn/xpu/qlinear.h b/aten/src/ATen/native/mkldnn/xpu/qlinear.h new file mode 100644 index 000000000000..738227666424 --- /dev/null +++ b/aten/src/ATen/native/mkldnn/xpu/qlinear.h @@ -0,0 +1,91 @@ +#pragma once + +#include +#include +#include + +namespace at::native::xpu { + +class QLinearOnednnXPU final { + public: + C10_API static Tensor q_linear_pointwise( + Tensor act, + double act_scale, + int64_t act_zero_point, + Tensor weight, + Tensor weight_scales, + Tensor weight_zero_points, + std::optional bias, + double output_scale, + int64_t output_zero_point, + std::optional output_dtype, + std::string_view post_op_name, + torch::List> post_op_args, + std::string_view post_op_algorithm); + + C10_API static Tensor q_linear_pointwise_tensor( + Tensor act, + Tensor act_scale, + Tensor act_zero_point, + Tensor weight, + Tensor weight_scales, + Tensor weight_zero_points, + std::optional bias, + double output_scale, + int64_t output_zero_point, + std::optional output_dtype, + std::string_view post_op_name, + torch::List> post_op_args, + std::string_view post_op_algorithm); + + C10_API static Tensor q_linear_pointwise_binary( + Tensor act, + double act_scale, + int64_t act_zero_point, + Tensor weight, + Tensor weight_scales, + Tensor weight_zero_points, + std::optional other, + std::optional bias, + double output_scale, + int64_t output_zero_point, + std::optional output_dtype, + double other_scale, + int64_t other_zero_point, + std::string_view binary_post_op, + double binary_alpha, + std::string_view unary_post_op, + torch::List> unary_post_op_args, + std::string_view unary_post_op_algorithm); + + C10_API static Tensor q_linear_pointwise_binary_tensor( + Tensor act, + Tensor act_scale, + Tensor act_zero_point, + Tensor weight, + Tensor weight_scales, + Tensor weight_zero_points, + std::optional other, + std::optional 
bias, + double output_scale, + int64_t output_zero_point, + std::optional output_dtype, + double other_scale, + int64_t other_zero_point, + std::string_view binary_post_op, + double binary_alpha, + std::string_view unary_post_op, + torch::List> unary_post_op_args, + std::string_view unary_post_op_algorithm); + + C10_API static Tensor q_linear_prepack_onednn( + at::Tensor weight, + std::optional> input_shape); + + static inline c10::ScalarType qlinear_decide_out_dtype( + const at::Tensor& act, + const std::optional output_dtype); + +}; // class QLinearOnednnXPU + +} // namespace at::native::xpu diff --git a/aten/src/ATen/native/mps/kernels/Attention.metal b/aten/src/ATen/native/mps/kernels/Attention.metal index 6bb2cbfb3d71..5a317895f508 100644 --- a/aten/src/ATen/native/mps/kernels/Attention.metal +++ b/aten/src/ATen/native/mps/kernels/Attention.metal @@ -14,8 +14,8 @@ template device T* out [[buffer(3)]], const constant uint& gqa_factor [[buffer(4)]], const constant uint& N [[buffer(5)]], - const constant uint2& k_head_seq_stride [[buffer(6)]], - const constant uint2& v_head_seq_stride [[buffer(7)]], + const constant uint3& qkv_head_strides [[buffer(6)]], + const constant uint3& qkv_seq_strides [[buffer(7)]], const constant float& scale [[buffer(8)]], const device bool* mask [[buffer(9)]], const constant uint3& mask_strides [[buffer(10)]], @@ -28,10 +28,12 @@ template constexpr uint BD = 32; constexpr uint qk_per_thread = D / BD; constexpr uint v_per_thread = V / BD; - const uint k_head_stride = k_head_seq_stride.x; - const uint k_seq_stride = k_head_seq_stride.y; - const uint v_head_stride = v_head_seq_stride.x; - const uint v_seq_stride = v_head_seq_stride.y; + const uint q_head_stride = qkv_head_strides.x; + const uint q_seq_stride = qkv_seq_strides.x; + const uint k_head_stride = qkv_head_strides.y; + const uint k_seq_stride = qkv_seq_strides.y; + const uint v_head_stride = qkv_head_strides.z; + const uint v_seq_stride = qkv_seq_strides.z; const uint mask_head_stride = mask_strides.x; const uint mask_kv_seq_stride = mask_strides.y; const uint mask_q_seq_stride = mask_strides.z; @@ -54,9 +56,9 @@ template const int kv_head_idx = head_idx / gqa_factor; const int Q = tpg.y; const int group_offset = head_idx * Q + q_seq_idx; - const int q_offset = group_offset; const int o_offset = group_offset; - queries += q_offset * D + simd_lid * qk_per_thread; + queries += head_idx * q_head_stride + q_seq_idx * q_seq_stride + + simd_lid * qk_per_thread; keys += kv_head_idx * k_head_stride + simd_gid * k_seq_stride + simd_lid * qk_per_thread; values += kv_head_idx * v_head_stride + simd_gid * v_seq_stride + @@ -156,8 +158,8 @@ template device float* maxs [[buffer(5)]], const constant uint& gqa_factor [[buffer(6)]], const constant uint& N [[buffer(7)]], - const constant uint2& k_head_seq_stride [[buffer(8)]], - const constant uint2& v_head_seq_stride [[buffer(9)]], + const constant uint3& qkv_head_strides [[buffer(8)]], + const constant uint3& qkv_seq_strides [[buffer(9)]], const constant float& scale [[buffer(10)]], const device bool* mask [[buffer(11)]], const constant uint3& mask_strides [[buffer(12)]], @@ -170,10 +172,12 @@ template constexpr int BD = 32; constexpr int qk_per_thread = D / BD; constexpr int v_per_thread = V / BD; - const int k_head_stride = k_head_seq_stride.x; - const int k_seq_stride = k_head_seq_stride.y; - const int v_head_stride = v_head_seq_stride.x; - const int v_seq_stride = v_head_seq_stride.y; + const int q_head_stride = qkv_head_strides.x; + const int q_seq_stride = 
qkv_seq_strides.x; + const int k_head_stride = qkv_head_strides.y; + const int k_seq_stride = qkv_seq_strides.y; + const int v_head_stride = qkv_head_strides.z; + const int v_seq_stride = qkv_seq_strides.z; const int mask_kv_seq_stride = mask_strides.x; const int mask_q_seq_stride = mask_strides.y; const int mask_head_stride = mask_strides.z; @@ -196,10 +200,10 @@ template const int head_idx = tid.x; const int q_seq_idx = tid.y; const int o_offset = head_idx * tpg.y + q_seq_idx; - const int q_offset = o_offset; const int kv_head_idx = head_idx / gqa_factor; - queries += q_offset * D + simd_lid * qk_per_thread; + queries += head_idx * q_head_stride + q_seq_idx * q_seq_stride + + simd_lid * qk_per_thread; keys += kv_head_idx * k_head_stride + (block_idx * BN + simd_gid) * k_seq_stride + simd_lid * qk_per_thread; values += kv_head_idx * v_head_stride + @@ -520,25 +524,25 @@ kernel void attention( } } -#define INSTANTIATE_SDPA_VECTOR(DTYPE, QK_DIM, VALUE_DIM) \ - template [[host_name("sdpa_vector_" #DTYPE "_" #QK_DIM \ - "_" #VALUE_DIM)]] kernel void \ - sdpa_vector( \ - const device DTYPE* queries [[buffer(0)]], \ - const device DTYPE* keys [[buffer(1)]], \ - const device DTYPE* values [[buffer(2)]], \ - device DTYPE* out [[buffer(3)]], \ - const constant uint& gqa_factor [[buffer(4)]], \ - const constant uint& N [[buffer(5)]], \ - const constant uint2& k_head_seq_stride [[buffer(6)]], \ - const constant uint2& v_head_seq_stride [[buffer(7)]], \ - const constant float& scale [[buffer(8)]], \ - const device bool* mask [[buffer(9)]], \ - const constant uint3& mask_strides [[buffer(10)]], \ - const constant bool& has_mask [[buffer(11)]], \ - uint3 tid [[threadgroup_position_in_grid]], \ - uint3 tpg [[threadgroups_per_grid]], \ - uint simd_gid [[simdgroup_index_in_threadgroup]], \ +#define INSTANTIATE_SDPA_VECTOR(DTYPE, QK_DIM, VALUE_DIM) \ + template [[host_name("sdpa_vector_" #DTYPE "_" #QK_DIM \ + "_" #VALUE_DIM)]] kernel void \ + sdpa_vector( \ + const device DTYPE* queries [[buffer(0)]], \ + const device DTYPE* keys [[buffer(1)]], \ + const device DTYPE* values [[buffer(2)]], \ + device DTYPE* out [[buffer(3)]], \ + const constant uint& gqa_factor [[buffer(4)]], \ + const constant uint& N [[buffer(5)]], \ + const constant uint3& qkv_head_strides [[buffer(6)]], \ + const constant uint3& qkv_seq_strides [[buffer(7)]], \ + const constant float& scale [[buffer(8)]], \ + const device bool* mask [[buffer(9)]], \ + const constant uint3& mask_strides [[buffer(10)]], \ + const constant bool& has_mask [[buffer(11)]], \ + uint3 tid [[threadgroup_position_in_grid]], \ + uint3 tpg [[threadgroups_per_grid]], \ + uint simd_gid [[simdgroup_index_in_threadgroup]], \ uint simd_lid [[thread_index_in_simdgroup]]); #define INSTANTIATE_SDPA_VECTOR_2PASS_1(DTYPE, QK_DIM, VALUE_DIM) \ @@ -553,8 +557,8 @@ kernel void attention( device float* maxs [[buffer(5)]], \ const constant uint& gqa_factor [[buffer(6)]], \ const constant uint& N [[buffer(7)]], \ - const constant uint2& k_head_seq_stride [[buffer(8)]], \ - const constant uint2& v_head_seq_stride [[buffer(9)]], \ + const constant uint3& qkv_head_strides [[buffer(8)]], \ + const constant uint3& qkv_seq_strides [[buffer(9)]], \ const constant float& scale [[buffer(10)]], \ const device bool* mask [[buffer(11)]], \ const constant uint3& mask_strides [[buffer(12)]], \ diff --git a/aten/src/ATen/native/mps/kernels/BinaryKernel.metal b/aten/src/ATen/native/mps/kernels/BinaryKernel.metal index f6f4935608e4..0539eab79500 100644 --- 
a/aten/src/ATen/native/mps/kernels/BinaryKernel.metal +++ b/aten/src/ATen/native/mps/kernels/BinaryKernel.metal @@ -39,6 +39,13 @@ struct lerp_alpha_functor { } }; +struct native_dropout_mask_and_scale_functor { + template + inline TA operator()(const TI a, const TI b, const TA scale) { + return static_cast(a) * static_cast(b) * scale; + } +}; + struct fmax_functor { template inline T operator()(const T a, const T b) { @@ -315,6 +322,20 @@ struct fmod_functor { } }; +struct igamma_functor { + template + inline T operator()(const T a, const T b) { + return c10::metal::igamma(a, b); + } +}; + +struct igammac_functor { + template + inline T operator()(const T a, const T b) { + return c10::metal::igammac(a, b); + } +}; + #define REGISTER_INTEGER_BINARY_OP(NAME) \ REGISTER_BINARY_OP(NAME, long, long); \ REGISTER_BINARY_OP(NAME, int, int); \ @@ -386,6 +407,8 @@ REGISTER_OPMATH_FLOAT_BINARY_OP(remainder); REGISTER_INTEGER_BINARY_OP(remainder); REGISTER_OPMATH_FLOAT_BINARY_OP(fmod); REGISTER_INTEGER_BINARY_OP(fmod); +REGISTER_OPMATH_FLOAT_BINARY_OP(igamma); +REGISTER_OPMATH_FLOAT_BINARY_OP(igammac); REGISTER_BINARY_ALPHA_OP(add_alpha, long, long, long); REGISTER_BINARY_ALPHA_OP(add_alpha, int, int, int); REGISTER_BINARY_ALPHA_OP(add_alpha, float, float, float); @@ -411,6 +434,10 @@ REGISTER_BINARY_ALPHA_OP(lerp_alpha, uchar, uchar, uchar); REGISTER_BINARY_ALPHA_OP(lerp_alpha, char, char, char); REGISTER_BINARY_ALPHA_OP(lerp_alpha, bool, bool, bool); +REGISTER_BINARY_ALPHA_OP(native_dropout_mask_and_scale, float, float, float); +REGISTER_BINARY_ALPHA_OP(native_dropout_mask_and_scale, bfloat, bfloat, bfloat); +REGISTER_BINARY_ALPHA_OP(native_dropout_mask_and_scale, half, half, half); + REGISTER_BINARY_ALPHA_OP(add_alpha, bfloat, bfloat, bfloat); REGISTER_BINARY_ALPHA_OP(sub_alpha, bfloat, bfloat, bfloat); REGISTER_BINARY_ALPHA_OP(lerp_alpha, bfloat, bfloat, bfloat); diff --git a/aten/src/ATen/native/mps/kernels/UnaryKernel.metal b/aten/src/ATen/native/mps/kernels/UnaryKernel.metal index 23c4810a2496..7db38da80532 100644 --- a/aten/src/ATen/native/mps/kernels/UnaryKernel.metal +++ b/aten/src/ATen/native/mps/kernels/UnaryKernel.metal @@ -490,11 +490,6 @@ struct bitwise_not_functor { } }; -template -float erfc(T x) { - return 1.0 - erf(x); -} - struct round_decimals_functor { template inline T operator()(const T x, const long ndigits) { @@ -503,6 +498,17 @@ struct round_decimals_functor { } }; +struct round_functor { + template , bool> = true> + inline T operator()(const T x) { + return static_cast(rint(float(x))); + } + template , bool> = true> + inline T operator()(const T x) { + return x; + } +}; + DEFINE_UNARY_FLOATING_FUNCTOR(erf); DEFINE_UNARY_FLOATING_FUNCTOR(erfc); DEFINE_UNARY_FLOATING_FUNCTOR(erfinv); @@ -515,6 +521,13 @@ REGISTER_UNARY_OP(neg, char, char); REGISTER_UNARY_OP(neg, uchar, uchar); REGISTER_UNARY_OP(neg, float, float); REGISTER_UNARY_OP(neg, half, half); +REGISTER_UNARY_OP(round, int, int); +REGISTER_UNARY_OP(round, long, long); +REGISTER_UNARY_OP(round, short, short); +REGISTER_UNARY_OP(round, char, char); +REGISTER_UNARY_OP(round, uchar, uchar); +REGISTER_UNARY_OP(round, float, float); +REGISTER_UNARY_OP(round, half, half); REGISTER_UNARY_OP(bitwise_not, int, int); REGISTER_UNARY_OP(bitwise_not, long, long); @@ -558,6 +571,7 @@ REGISTER_UNARY_OP(abs, half, half); INSTANTIATE_UNARY_KERNELS2(bfloat, bfloat); REGISTER_UNARY_OP(neg, bfloat, bfloat); +REGISTER_UNARY_OP(round, bfloat, bfloat); REGISTER_UNARY_OP(abs, bfloat, bfloat); INSTANTIATE_UNARY_KERNELS2(half, half); 
INSTANTIATE_UNARY_KERNELS2(float, float); diff --git a/aten/src/ATen/native/mps/operations/Attention.mm b/aten/src/ATen/native/mps/operations/Attention.mm index 69ec9af055ba..11498ade6fd0 100644 --- a/aten/src/ATen/native/mps/operations/Attention.mm +++ b/aten/src/ATen/native/mps/operations/Attention.mm @@ -182,6 +182,8 @@ uint maxSeqLength = k_.size(2); uint N = k_.size(2); uint B = q_.size(0) * q_.size(1); + uint q_head_stride = q_.stride(1); + uint q_seq_stride = q_.stride(2); uint k_head_stride = k_.stride(1); uint k_seq_stride = k_.stride(2); uint v_head_stride = v_.stride(1); @@ -209,8 +211,8 @@ out, 1, N, - std::array{k_head_stride, k_seq_stride}, - std::array{v_head_stride, v_seq_stride}, + std::array{q_head_stride, k_head_stride, v_head_stride}, + std::array{q_seq_stride, k_seq_stride, v_seq_stride}, scale_factor); if (has_mask) { @@ -257,6 +259,8 @@ uint B = batchSize * num_heads; uint gqa_factor = q_.size(1) / k_.size(1); + uint q_head_stride = q_.stride(1); + uint q_seq_stride = q_.stride(2); uint k_head_stride = k_.stride(1); uint k_seq_stride = k_.stride(2); uint v_head_stride = v_.stride(1); @@ -294,8 +298,8 @@ maxs, gqa_factor, N, - std::array{k_head_stride, k_seq_stride}, - std::array{v_head_stride, v_seq_stride}, + std::array{q_head_stride, k_head_stride, v_head_stride}, + std::array{q_seq_stride, k_seq_stride, v_seq_stride}, scale_factor); if (has_mask) { diff --git a/aten/src/ATen/native/mps/operations/BinaryKernel.mm b/aten/src/ATen/native/mps/operations/BinaryKernel.mm index b2a1b2757b13..0b303f48028f 100644 --- a/aten/src/ATen/native/mps/operations/BinaryKernel.mm +++ b/aten/src/ATen/native/mps/operations/BinaryKernel.mm @@ -168,6 +168,10 @@ static void lerp_scalar_mps_kernel(at::TensorIteratorBase& iter, const Scalar& w lib.exec_binary_kernel(iter, "lerp_alpha", weight); } +static void native_dropout_mask_and_scale_mps_kernel(at::TensorIteratorBase& iter, const Scalar& scale) { + lib.exec_binary_kernel(iter, "native_dropout_mask_and_scale", scale); +} + static void mul_mps_kernel(TensorIteratorBase& iter) { lib.exec_binary_kernel(iter, "mul"); } @@ -192,6 +196,14 @@ static void fmod_mps_kernel(TensorIteratorBase& iter) { lib.exec_binary_kernel(iter, "fmod"); } +static void igamma_mps_kernel(TensorIteratorBase& iter) { + lib.exec_binary_kernel(iter, "igamma"); +} + +static void igammac_mps_kernel(TensorIteratorBase& iter) { + lib.exec_binary_kernel(iter, "igammac"); +} + REGISTER_DISPATCH(fmax_stub, &fmax_mps_kernel) REGISTER_DISPATCH(fmin_stub, &fmin_mps_kernel) REGISTER_DISPATCH(copysign_stub, ©sign_mps_kernel) @@ -217,4 +229,6 @@ static void fmod_mps_kernel(TensorIteratorBase& iter) { REGISTER_DISPATCH(div_trunc_stub, &div_trunc_mps_kernel) REGISTER_DISPATCH(fmod_stub, &fmod_mps_kernel) REGISTER_DISPATCH(remainder_stub, &remainder_mps_kernel) +REGISTER_DISPATCH(igamma_stub, &igamma_mps_kernel) +REGISTER_DISPATCH(igammac_stub, &igammac_mps_kernel) } // namespace at::native diff --git a/aten/src/ATen/native/mps/operations/Dropout.mm b/aten/src/ATen/native/mps/operations/Dropout.mm new file mode 100644 index 000000000000..116367d809eb --- /dev/null +++ b/aten/src/ATen/native/mps/operations/Dropout.mm @@ -0,0 +1,45 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include +#include +#include +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#include +#endif + +namespace at::native { + +static Tensor native_dropout_mask_and_scale(const Tensor& input, const Tensor& mask, float scale) { + auto output 
= at::empty_like(input); + mps::binary_op_kernel("native_dropout_mask_and_scale", input, mask, output, scale); + return output; +} + +std::tuple native_dropout_mps(const Tensor& input, double p, std::optional train) { + if (input.numel() == 0 || !train.value_or(false) || p == 0) { + return {input.clone(), at::ones_like(input, input.options().dtype(c10::kBool))}; + } + + float p_comp = 1.0f - p; + Tensor mask = at::empty_like(input, input.options().dtype(c10::kBool)); + mask.bernoulli_(p_comp); + auto scale = p_comp == 0 ? 0.0f : 1.0f / p_comp; + Tensor output = native_dropout_mask_and_scale(input, mask, scale); + return {std::move(output), std::move(mask)}; +} + +Tensor native_dropout_backward_mps(const Tensor& grad, const Tensor& mask, double scale) { + auto grad_float = isFloatingType(grad.scalar_type()) ? grad : grad.to(c10::kFloat); + return native_dropout_mask_and_scale(grad_float, mask, scale); +} + +} // namespace at::native \ No newline at end of file diff --git a/aten/src/ATen/native/mps/operations/Linear.mm b/aten/src/ATen/native/mps/operations/Linear.mm index 42769c13f1e1..219086edd8e3 100644 --- a/aten/src/ATen/native/mps/operations/Linear.mm +++ b/aten/src/ATen/native/mps/operations/Linear.mm @@ -115,7 +115,10 @@ Tensor _mps_linear(const Tensor& input, const Tensor& weight_arg, const std::opt return output; } - if (is_macos_13_or_newer(MacOSVersion::MACOS_VER_15_0_PLUS)) { + // No-graph execution causes nonsense if these are non-contiguous. + const bool is_contiguous = input.is_contiguous() && weight.is_contiguous() && bias.is_contiguous(); + + if (is_macos_13_or_newer(MacOSVersion::MACOS_VER_15_0_PLUS) && is_contiguous) { _mps_linear_nograph(input, weight, bias, output); // Squeeze last dim of 1D linear return weight_arg.dim() != 1 ? output : output.squeeze(-1); diff --git a/aten/src/ATen/native/mps/operations/Sort.mm b/aten/src/ATen/native/mps/operations/Sort.mm index cfec1e443e25..6ff47044df13 100644 --- a/aten/src/ATen/native/mps/operations/Sort.mm +++ b/aten/src/ATen/native/mps/operations/Sort.mm @@ -2,6 +2,7 @@ #define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include #include +#include #include #include #include @@ -11,10 +12,85 @@ #include #include #else +#include #include #include #endif namespace at::native { +namespace { + +void kthvalue_out_mps_impl(const Tensor& self, int64_t k, int64_t dim, Tensor& values, Tensor& indices) { + using namespace mps; + if (self.dim() == 0 && self.numel() == 1) { + values.copy_(self); + indices.zero_(); + return; + } + // Handle empty tensors + if (self.numel() == 0) { + values.copy_(self); + indices.copy_(values.toType(at::ScalarType::Long)); + return; + } + // issue #154890, raising error to prevent crash within MPSGraph until + // workaround is implemented. + TORCH_CHECK(self.dim() - dim <= 4, "On-going issue on MPSGraph topk when ndims() - axis > 4, see issue #154890"); + + auto stream = getCurrentMPSStream(); + struct CachedGraph : public MPSCachedGraph { + CachedGraph(MPSGraph* graph) : MPSCachedGraph(graph) {} + MPSGraphTensor *selfTensor = nil, *valuesTensor = nil, *indicesTensor = nil; + }; + + // MPSGraph kthvalue is always sorted. 
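+  // Values are computed by sorting the (cast) input along `dim` and slicing +  // out the element at offset k - 1; the matching indices come from an argsort +  // of the same tensor sliced at the same offset. The compiled graph is cached +  // under a key built from the input shape, dtype, k, and dim.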
+ @autoreleasepool { + // Input as placeholders + MPSShape* input_shape = getMPSShape(self); + NSString* ns_shape_key = [[input_shape valueForKey:@"description"] componentsJoinedByString:@","]; + std::string key = std::string("kthvalue:") + [ns_shape_key UTF8String] + ":" + getMPSTypeString(self) + ":k" + + std::to_string(k) + ":dim" + std::to_string(dim); + auto cachedGraph = LookUpOrCreateCachedGraph(key, [&](auto mpsGraph, auto newCachedGraph) { + newCachedGraph->selfTensor = mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(self), input_shape); + + MPSGraphTensor* castInputTensor = newCachedGraph->selfTensor; + MPSDataType dataType = getMPSDataType(self); + // #issue 104398441 sortWithTensor and argsortWithTensor + if (dataType != MPSDataTypeInt32 && dataType != MPSDataTypeFloat32 && dataType != MPSDataTypeFloat16) { + dataType = (dataType & MPSDataTypeFloatBit) ? MPSDataTypeFloat32 : MPSDataTypeInt32; + castInputTensor = [mpsGraph castTensor:newCachedGraph->selfTensor toType:dataType name:@"castInputTensor"]; + } + MPSGraphTensor* sortedTensor = [mpsGraph sortWithTensor:castInputTensor + axis:(NSUInteger)dim + descending:false + name:nil]; + sortedTensor = [mpsGraph sliceTensor:sortedTensor + dimension:(NSUInteger)dim + start:((NSUInteger)k - 1) + length:1 + name:nil]; + MPSGraphTensor* argSortedTensor = [mpsGraph argSortWithTensor:castInputTensor + axis:(NSInteger)dim + descending:false + name:@"kthvalue_out"]; + argSortedTensor = [mpsGraph sliceTensor:argSortedTensor + dimension:dim + start:((NSUInteger)k - 1) + length:1 + name:nil]; + newCachedGraph->valuesTensor = sortedTensor; + newCachedGraph->indicesTensor = argSortedTensor; + }); + Placeholder inputPlaceholder = Placeholder(cachedGraph->selfTensor, self); + // Outputs as placeholders + Placeholder valuesPlaceholder = Placeholder(cachedGraph->valuesTensor, values); + Placeholder indicesPlaceholder = Placeholder(cachedGraph->indicesTensor, indices); + // Create dictionary of inputs and outputs + auto feeds = dictionaryFromPlaceholders(inputPlaceholder); + auto results = dictionaryFromPlaceholders(valuesPlaceholder, indicesPlaceholder); + runMPSGraph(stream, cachedGraph->graph(), feeds, results); + } +} +} // anonymous namespace // sort TORCH_IMPL_FUNC(sort_stable_out_mps) @@ -81,4 +157,31 @@ runMPSGraph(stream, cachedGraph->graph(), feeds, results); } } + +std::tuple kthvalue_out_mps(const Tensor& self, + int64_t k, + int64_t dim_, + bool keepdim, + Tensor& values, + Tensor& indices) { + // See note [Writing Nondeterministic Operations] + // If there are duplicate elements of the kth value, the procedure for choosing which + // of the duplicates to use for the indices output is nondeterministic. + at::globalContext().alertNotDeterministic("kthvalue MPS"); + + int64_t dim = maybe_wrap_dim(dim_, self.dim(), /*wrap_scalar=*/true); + int64_t slicesize = self.dim() == 0 ? 
1 : self.size(dim); + TORCH_CHECK(k >= 1 && k <= slicesize, "kthvalue(): selected number k out of range for dimension ", dim); + at::assert_no_overlap(self, values); + _reduction_with_indices_allocate_or_resize_output(values, indices, self, dim, keepdim); + + kthvalue_out_mps_impl(self, k, dim, values, indices); + + if (!keepdim) { + values.squeeze_(dim); + indices.squeeze_(dim); + } + + return std::forward_as_tuple(values, indices); +} } // namespace at::native diff --git a/aten/src/ATen/native/mps/operations/TensorCompare.mm b/aten/src/ATen/native/mps/operations/TensorCompare.mm index 16e0608012f3..7b637d896f85 100644 --- a/aten/src/ATen/native/mps/operations/TensorCompare.mm +++ b/aten/src/ATen/native/mps/operations/TensorCompare.mm @@ -335,6 +335,9 @@ static void isin_Tensor_Tensor_out_mps(const Tensor& elements, } static void is_posneginf_helper(TensorIteratorBase& iter, bool is_neg) { + if (iter.numel() == 0) { + return; + } const auto& self = iter.input(0); auto& out = iter.output(0); @autoreleasepool { diff --git a/aten/src/ATen/native/mps/operations/UnaryKernel.mm b/aten/src/ATen/native/mps/operations/UnaryKernel.mm index b560739ed40c..7e150b133cc6 100644 --- a/aten/src/ATen/native/mps/operations/UnaryKernel.mm +++ b/aten/src/ATen/native/mps/operations/UnaryKernel.mm @@ -50,6 +50,7 @@ static void round_decimals_kernel(TensorIteratorBase& iter, int64_t decimals) { REGISTER_UNARY_TI_DISPATCH(log); REGISTER_UNARY_TI_DISPATCH(log1p); REGISTER_UNARY_TI_DISPATCH(bitwise_not); +REGISTER_UNARY_TI_DISPATCH(round); REGISTER_UNARY_TI_DISPATCH(sigmoid); REGISTER_DISPATCH(round_decimals_stub, round_decimals_kernel); } // namespace at::native diff --git a/aten/src/ATen/native/mps/operations/UnaryOps.mm b/aten/src/ATen/native/mps/operations/UnaryOps.mm index 8fbefcb6ab8a..d7ce40e5cbb4 100644 --- a/aten/src/ATen/native/mps/operations/UnaryOps.mm +++ b/aten/src/ATen/native/mps/operations/UnaryOps.mm @@ -184,7 +184,6 @@ static void unary_op(const Tensor& self, REGISTER_MPS_UNARY_STUB(ceil, ceil); REGISTER_MPS_UNARY_STUB(floor, floor); -REGISTER_MPS_UNARY_STUB(round, round); REGISTER_MPS_UNARY_STUB(trunc, truncate); #define CREATE_MPS_STRUCTURED_UNARY_TORCH_IMPL_FUNC(func_out, func_stub) \ @@ -418,6 +417,7 @@ static void cumulative_op_impl(const Tensor& self, Tensor& conj_physical_out_mps(const Tensor& self, Tensor& result) { TORCH_CHECK(self.is_complex()); + TORCH_CHECK(self.dtype() != at::kComplexDouble); mps::unary_op(self, result, "conj", ^MPSGraphTensor*(MPSGraph* mpsGraph, MPSGraphTensor* inputTensor) { return [mpsGraph conjugateWithTensor:inputTensor name:nil]; }); diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index 113db1c1e437..abb061afc5c9 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -288,6 +288,7 @@ dispatch: CPU: native_dropout_cpu CUDA: native_dropout_cuda + MPS: native_dropout_mps NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: native_dropout_nested tags: [nondeterministic_seeded, core] autogen: native_dropout.out @@ -296,6 +297,7 @@ dispatch: CPU, NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: native_dropout_backward CUDA: native_dropout_backward_cuda + MPS: native_dropout_backward_mps autogen: native_dropout_backward.out tags: pointwise @@ -340,8 +342,8 @@ variants: function, method dispatch: CompositeExplicitAutograd: abs - SparseCPU, SparseCUDA: abs_sparse - SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: abs_sparse_csr + SparseCPU, SparseCUDA, SparseMPS: 
abs_sparse + SparseCsrCPU, SparseCsrCUDA, SparseCsrMPS, SparseCsrMeta: abs_sparse_csr NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_abs tags: [core, pointwise] @@ -350,16 +352,16 @@ variants: function, method dispatch: CompositeExplicitAutograd: abs_ - SparseCPU, SparseCUDA: abs_sparse_ - SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: abs_sparse_csr_ + SparseCPU, SparseCUDA, SparseMPS: abs_sparse_ + SparseCsrCPU, SparseCsrCUDA, SparseCsrMPS, SparseCsrMeta: abs_sparse_csr_ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_abs_ - func: abs.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) device_check: NoCheck # TensorIterator dispatch: CPU, CUDA, MPS, MTIA: abs_out - SparseCPU, SparseCUDA: abs_sparse_out - SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: abs_sparse_csr_out + SparseCPU, SparseCUDA, SparseMPS: abs_sparse_out + SparseCsrCPU, SparseCsrCUDA, SparseCsrMPS, SparseCsrMeta: abs_sparse_csr_out tags: pointwise # Note [Adding an alias] @@ -428,7 +430,7 @@ variants: function, method structured_delegate: sgn.out dispatch: - SparseCPU, SparseCUDA: sgn_sparse + SparseCPU, SparseCUDA, SparseMPS: sgn_sparse SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sgn_sparse_csr NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_sgn tags: pointwise @@ -437,7 +439,7 @@ variants: method structured_delegate: sgn.out dispatch: - SparseCPU, SparseCUDA: sgn_sparse_ + SparseCPU, SparseCUDA, SparseMPS: sgn_sparse_ SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sgn_sparse_csr_ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_sgn_ tags: pointwise @@ -448,7 +450,7 @@ dispatch: CPU, CUDA: sgn_out MPS: sgn_out_mps - SparseCPU, SparseCUDA: sgn_sparse_out + SparseCPU, SparseCUDA, SparseMPS: sgn_sparse_out SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sgn_sparse_csr_out tags: pointwise @@ -476,7 +478,7 @@ variants: function, method dispatch: CompositeExplicitAutograd: _conj_physical - SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: conj_physical_sparse_csr + SparseCsrCPU, SparseCsrCUDA, SparseCsrMPS, SparseCsrMeta: conj_physical_sparse_csr autogen: _conj_physical.out - func: conj_physical(Tensor self) -> Tensor @@ -487,8 +489,8 @@ dispatch: CPU, CUDA: conj_physical_out MPS: conj_physical_out_mps - SparseCPU, SparseCUDA: conj_physical_out_sparse - SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: conj_physical_sparse_csr_out + SparseCPU, SparseCUDA, SparseMPS: conj_physical_out_sparse + SparseCsrCPU, SparseCsrCUDA, SparseCsrMPS, SparseCsrMeta: conj_physical_sparse_csr_out tags: pointwise - func: conj_physical_(Tensor(a!) self) -> Tensor(a!) 
@@ -554,7 +556,7 @@ structured_delegate: add.out variants: function, method dispatch: - SparseCPU, SparseCUDA, SparseMeta: add_sparse + SparseCPU, SparseCUDA, SparseMPS, SparseMeta: add_sparse SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: add_sparse_csr MkldnnCPU: mkldnn_add ZeroTensor: add_zerotensor @@ -566,7 +568,7 @@ variants: method structured_delegate: add.out dispatch: - SparseCPU, SparseCUDA, SparseMeta: add_sparse_ + SparseCPU, SparseCUDA, SparseMPS, SparseMeta: add_sparse_ SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: add_sparse_csr_ MkldnnCPU: mkldnn_add_ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_add__Tensor @@ -582,6 +584,7 @@ dispatch: SparseCPU, SparseMeta: add_out_sparse_cpu SparseCUDA: add_out_sparse_cuda + SparseMPS: add_out_sparse_mps SparseCsrCPU, SparseCsrMeta: add_out_sparse_compressed_cpu SparseCsrCUDA: add_out_sparse_compressed_cuda MkldnnCPU: mkldnn_add_out @@ -874,7 +877,7 @@ variants: function, method structured_delegate: asinh.out dispatch: - SparseCPU, SparseCUDA: asinh_sparse + SparseCPU, SparseCUDA, SparseMPS: asinh_sparse SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: asinh_sparse_csr tags: [core, pointwise] @@ -882,7 +885,7 @@ variants: function, method structured_delegate: asinh.out dispatch: - SparseCPU, SparseCUDA: asinh_sparse_ + SparseCPU, SparseCUDA, SparseMPS: asinh_sparse_ SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: asinh_sparse_csr_ tags: pointwise @@ -892,7 +895,7 @@ dispatch: CPU, CUDA: asinh_out MPS: asinh_out_mps - SparseCPU, SparseCUDA: asinh_sparse_out + SparseCPU, SparseCUDA, SparseMPS: asinh_sparse_out SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: asinh_sparse_csr_out tags: pointwise @@ -909,7 +912,7 @@ structured_delegate: atanh.out variants: function, method dispatch: - SparseCPU, SparseCUDA: atanh_sparse + SparseCPU, SparseCUDA, SparseMPS: atanh_sparse SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: atanh_sparse_csr tags: [core, pointwise] @@ -917,7 +920,7 @@ structured_delegate: atanh.out variants: function, method dispatch: - SparseCPU, SparseCUDA: atanh_sparse_ + SparseCPU, SparseCUDA, SparseMPS: atanh_sparse_ SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: atanh_sparse_csr_ tags: pointwise @@ -927,7 +930,7 @@ dispatch: CPU, CUDA: atanh_out MPS: atanh_out_mps - SparseCPU, SparseCUDA: atanh_sparse_out + SparseCPU, SparseCUDA, SparseMPS: atanh_sparse_out SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: atanh_sparse_csr_out tags: pointwise # arctanh, alias for atanh @@ -964,7 +967,7 @@ variants: function, method structured_delegate: asin.out dispatch: - SparseCPU, SparseCUDA: asin_sparse + SparseCPU, SparseCUDA, SparseMPS: asin_sparse SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: asin_sparse_csr tags: [core, pointwise] @@ -973,7 +976,7 @@ variants: function, method structured_delegate: asin.out dispatch: - SparseCPU, SparseCUDA: asin_sparse_ + SparseCPU, SparseCUDA, SparseMPS: asin_sparse_ SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: asin_sparse_csr_ tags: pointwise @@ -983,7 +986,7 @@ structured_inherits: TensorIteratorBase dispatch: CPU, CUDA, MPS: asin_out - SparseCPU, SparseCUDA: asin_sparse_out + SparseCPU, SparseCUDA, SparseMPS: asin_sparse_out SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: asin_sparse_csr_out tags: pointwise @@ -1001,7 +1004,7 @@ structured_delegate: atan.out variants: function, method dispatch: - SparseCPU, SparseCUDA: atan_sparse + SparseCPU, SparseCUDA, SparseMPS: atan_sparse SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: atan_sparse_csr tags: [core, pointwise] @@ -1010,7 +1013,7 @@ structured_delegate: atan.out 
variants: function, method dispatch: - SparseCPU, SparseCUDA: atan_sparse_ + SparseCPU, SparseCUDA, SparseMPS: atan_sparse_ SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: atan_sparse_csr_ tags: pointwise @@ -1020,7 +1023,7 @@ structured_inherits: TensorIteratorBase dispatch: CPU, CUDA, MPS: atan_out - SparseCPU, SparseCUDA: atan_sparse_out + SparseCPU, SparseCUDA, SparseMPS: atan_sparse_out SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: atan_sparse_csr_out tags: pointwise @@ -1459,7 +1462,7 @@ structured_delegate: ceil.out variants: function, method dispatch: - SparseCPU, SparseCUDA: ceil_sparse + SparseCPU, SparseCUDA, SparseMPS: ceil_sparse SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: ceil_sparse_csr tags: [core, pointwise] @@ -1468,7 +1471,7 @@ structured_delegate: ceil.out variants: function, method dispatch: - SparseCPU, SparseCUDA: ceil_sparse_ + SparseCPU, SparseCUDA, SparseMPS: ceil_sparse_ SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: ceil_sparse_csr_ tags: pointwise @@ -1478,7 +1481,7 @@ structured_inherits: TensorIteratorBase dispatch: CPU, CUDA, MPS: ceil_out - SparseCPU, SparseCUDA: ceil_sparse_out + SparseCPU, SparseCUDA, SparseMPS: ceil_sparse_out SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: ceil_sparse_csr_out tags: pointwise @@ -2406,7 +2409,7 @@ MPS: empty_mps Meta: empty_meta_symint MkldnnCPU: empty_mkldnn - SparseCPU, SparseCUDA: empty_sparse + SparseCPU, SparseCUDA, SparseMPS: empty_sparse SparseMeta: empty_sparse_symint SparseCsrCPU, SparseCsrCUDA: empty_sparse_compressed SparseCsrMeta: empty_sparse_compressed_symint @@ -2534,7 +2537,7 @@ structured_delegate: erf.out variants: function, method dispatch: - SparseCPU, SparseCUDA: erf_sparse + SparseCPU, SparseCUDA, SparseMPS: erf_sparse SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: erf_sparse_csr tags: [core, pointwise] @@ -2543,7 +2546,7 @@ structured_delegate: erf.out variants: function, method dispatch: - SparseCPU, SparseCUDA: erf_sparse_ + SparseCPU, SparseCUDA, SparseMPS: erf_sparse_ SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: erf_sparse_csr_ tags: pointwise @@ -2553,7 +2556,7 @@ structured_inherits: TensorIteratorBase dispatch: CPU, CUDA, MPS, MTIA: erf_out - SparseCPU, SparseCUDA: erf_sparse_out + SparseCPU, SparseCUDA, SparseMPS: erf_sparse_out SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: erf_sparse_csr_out tags: pointwise @@ -2619,7 +2622,7 @@ structured_delegate: expm1.out variants: function, method dispatch: - SparseCPU, SparseCUDA: expm1_sparse + SparseCPU, SparseCUDA, SparseMPS: expm1_sparse SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: expm1_sparse_csr tags: [core, pointwise] @@ -2628,7 +2631,7 @@ structured_delegate: expm1.out variants: function, method dispatch: - SparseCPU, SparseCUDA: expm1_sparse_ + SparseCPU, SparseCUDA, SparseMPS: expm1_sparse_ SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: expm1_sparse_csr_ tags: pointwise @@ -2638,7 +2641,7 @@ structured_inherits: TensorIteratorBase dispatch: CPU, CUDA, MPS: expm1_out - SparseCPU, SparseCUDA: expm1_sparse_out + SparseCPU, SparseCUDA, SparseMPS: expm1_sparse_out SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: expm1_sparse_csr_out tags: pointwise @@ -2737,7 +2740,7 @@ structured_delegate: floor.out variants: function, method dispatch: - SparseCPU, SparseCUDA: floor_sparse + SparseCPU, SparseCUDA, SparseMPS: floor_sparse SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: floor_sparse_csr tags: [core, pointwise] @@ -2746,7 +2749,7 @@ structured_delegate: floor.out variants: function, method dispatch: - SparseCPU, SparseCUDA: floor_sparse_ + SparseCPU, SparseCUDA, SparseMPS: 
floor_sparse_ SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: floor_sparse_csr_ tags: pointwise @@ -2756,7 +2759,7 @@ structured_inherits: TensorIteratorBase dispatch: CPU, CUDA, MPS: floor_out - SparseCPU, SparseCUDA: floor_sparse_out + SparseCPU, SparseCUDA, SparseMPS: floor_sparse_out SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: floor_sparse_csr_out tags: pointwise @@ -2764,7 +2767,7 @@ device_check: NoCheck # TensorIterator variants: function, method dispatch: - CPU, CUDA, MPS: floor_divide + CPU, CUDA, MPS, MTIA: floor_divide SparseCPU, SparseCUDA: floor_divide_sparse - func: floor_divide_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) @@ -2798,7 +2801,7 @@ structured_delegate: frac.out variants: function, method dispatch: - SparseCPU, SparseCUDA: frac_sparse + SparseCPU, SparseCUDA, SparseMPS: frac_sparse SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: frac_sparse_csr tags: pointwise @@ -2807,7 +2810,7 @@ structured_delegate: frac.out variants: function, method dispatch: - SparseCPU, SparseCUDA: frac_sparse_ + SparseCPU, SparseCUDA, SparseMPS: frac_sparse_ SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: frac_sparse_csr_ tags: pointwise @@ -2818,7 +2821,7 @@ dispatch: CPU, CUDA: frac_out MPS: frac_out_mps - SparseCPU, SparseCUDA: frac_sparse_out + SparseCPU, SparseCUDA, SparseMPS: frac_sparse_out SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: frac_sparse_csr_out tags: pointwise @@ -3208,7 +3211,7 @@ dispatch: CPU, CUDA, MPS, MTIA: isnan NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_isnan - SparseCPU, SparseCUDA: isnan_sparse + SparseCPU, SparseCUDA, SparseMPS: isnan_sparse SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: isnan_sparse_csr autogen: isnan.out tags: [core, pointwise] @@ -3289,6 +3292,7 @@ dispatch: CPU: kthvalue_out_cpu CUDA: kthvalue_out_cuda + MPS: kthvalue_out_mps - func: kthvalue.dimname(Tensor self, SymInt k, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices) variants: function, method @@ -3336,21 +3340,21 @@ variants: function, method dispatch: CompositeExplicitAutograd: nan_to_num - SparseCPU, SparseCUDA: nan_to_num_sparse + SparseCPU, SparseCUDA, SparseMPS: nan_to_num_sparse tags: pointwise - func: nan_to_num_(Tensor(a!) self, float? nan=None, float? posinf=None, float? neginf=None) -> Tensor(a!) variants: function, method dispatch: CompositeExplicitAutograd: nan_to_num_ - SparseCPU, SparseCUDA: nan_to_num_sparse_ + SparseCPU, SparseCUDA, SparseMPS: nan_to_num_sparse_ tags: pointwise - func: nan_to_num.out(Tensor self, float? nan=None, float? posinf=None, float? neginf=None, *, Tensor(a!) out) -> Tensor(a!) dispatch: CPU, CUDA, MTIA: nan_to_num_out MPS: nan_to_num_out_mps - SparseCPU, SparseCUDA: nan_to_num_sparse_out + SparseCPU, SparseCUDA, SparseMPS: nan_to_num_sparse_out tags: pointwise - func: linear(Tensor input, Tensor weight, Tensor? 
bias=None) -> Tensor @@ -3553,7 +3557,7 @@ structured_delegate: log1p.out variants: function, method dispatch: - SparseCPU, SparseCUDA: log1p_sparse + SparseCPU, SparseCUDA, SparseMPS: log1p_sparse SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: log1p_sparse_csr tags: [core, pointwise] @@ -3562,7 +3566,7 @@ structured_delegate: log1p.out variants: function, method dispatch: - SparseCPU, SparseCUDA: log1p_sparse_ + SparseCPU, SparseCUDA, SparseMPS: log1p_sparse_ SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: log1p_sparse_csr_ tags: pointwise @@ -3572,7 +3576,7 @@ structured_inherits: TensorIteratorBase dispatch: CPU, CUDA, MPS: log1p_out - SparseCPU, SparseCUDA: log1p_sparse_out + SparseCPU, SparseCUDA, SparseMPS: log1p_sparse_out SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: log1p_sparse_csr_out tags: pointwise @@ -4664,7 +4668,7 @@ variants: function, method dispatch: CompositeExplicitAutograd: rad2deg - SparseCPU, SparseCUDA: rad2deg_sparse + SparseCPU, SparseCUDA, SparseMPS: rad2deg_sparse SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: rad2deg_sparse_csr tags: pointwise @@ -4672,14 +4676,14 @@ variants: function, method dispatch: CompositeExplicitAutograd: rad2deg_ - SparseCPU, SparseCUDA: rad2deg_sparse_ + SparseCPU, SparseCUDA, SparseMPS: rad2deg_sparse_ SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: rad2deg_sparse_csr_ tags: pointwise - func: rad2deg.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) dispatch: CompositeExplicitAutograd: rad2deg_out - SparseCPU, SparseCUDA: rad2deg_sparse_out + SparseCPU, SparseCUDA, SparseMPS: rad2deg_sparse_out SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: rad2deg_sparse_csr_out tags: pointwise @@ -4687,7 +4691,7 @@ variants: function, method dispatch: CompositeExplicitAutograd: deg2rad - SparseCPU, SparseCUDA: deg2rad_sparse + SparseCPU, SparseCUDA, SparseMPS: deg2rad_sparse SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: deg2rad_sparse_csr tags: pointwise @@ -4695,14 +4699,14 @@ variants: function, method dispatch: CompositeExplicitAutograd: deg2rad_ - SparseCPU, SparseCUDA: deg2rad_sparse_ + SparseCPU, SparseCUDA, SparseMPS: deg2rad_sparse_ SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: deg2rad_sparse_csr_ tags: pointwise - func: deg2rad.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) 
dispatch: CompositeExplicitAutograd: deg2rad_out - SparseCPU, SparseCUDA: deg2rad_sparse_out + SparseCPU, SparseCUDA, SparseMPS: deg2rad_sparse_out SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: deg2rad_sparse_csr_out tags: pointwise @@ -4928,7 +4932,7 @@ structured_delegate: neg.out variants: function, method dispatch: - SparseCPU, SparseCUDA: neg_sparse + SparseCPU, SparseCUDA, SparseMPS: neg_sparse SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: neg_sparse_csr NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_neg tags: [core, pointwise] @@ -4938,7 +4942,7 @@ structured_delegate: neg.out variants: function, method dispatch: - SparseCPU, SparseCUDA: neg_sparse_ + SparseCPU, SparseCUDA, SparseMPS: neg_sparse_ SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: neg_sparse_csr_ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_neg_ tags: pointwise @@ -4949,7 +4953,7 @@ structured_inherits: TensorIteratorBase dispatch: CPU, CUDA, MPS, MTIA: neg_out - SparseCPU, SparseCUDA: neg_out_sparse + SparseCPU, SparseCUDA, SparseMPS: neg_out_sparse SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: neg_sparse_csr_out tags: pointwise # Alias for neg @@ -5033,7 +5037,7 @@ structured_delegate: round.out variants: function, method dispatch: - SparseCPU, SparseCUDA: round_sparse + SparseCPU, SparseCUDA, SparseMPS: round_sparse SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: round_sparse_csr tags: [core, pointwise] @@ -5042,7 +5046,7 @@ structured_delegate: round.out variants: function, method dispatch: - SparseCPU, SparseCUDA: round_sparse_ + SparseCPU, SparseCUDA, SparseMPS: round_sparse_ SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: round_sparse_csr_ tags: pointwise @@ -5052,7 +5056,7 @@ structured_inherits: TensorIteratorBase dispatch: CPU, CUDA, MPS: round_out - SparseCPU, SparseCUDA: round_sparse_out + SparseCPU, SparseCUDA, SparseMPS: round_sparse_out SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: round_sparse_csr_out tags: pointwise @@ -5095,7 +5099,7 @@ QuantizedCPU: relu_quantized_cpu QuantizedCUDA: relu_quantized_cuda NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_relu - SparseCPU, SparseCUDA: relu_sparse + SparseCPU, SparseCUDA, SparseMPS: relu_sparse SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: relu_sparse_csr tags: [core, pointwise] @@ -5110,7 +5114,7 @@ QuantizedCPU: relu_quantized_cpu_ QuantizedCUDA: relu_quantized_cuda_ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_relu_ - SparseCPU, SparseCUDA: relu_sparse_ + SparseCPU, SparseCUDA, SparseMPS: relu_sparse_ SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: relu_sparse_csr_ autogen: relu.out tags: pointwise @@ -5397,7 +5401,7 @@ variants: function, method dispatch: SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sin_sparse_csr - SparseCPU, SparseCUDA: sin_sparse + SparseCPU, SparseCUDA, SparseMPS: sin_sparse NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_sin tags: [core, pointwise] @@ -5407,7 +5411,7 @@ variants: function, method dispatch: SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sin_sparse_csr_ - SparseCPU, SparseCUDA: sin_sparse_ + SparseCPU, SparseCUDA, SparseMPS: sin_sparse_ tags: pointwise - func: sin.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) 
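The SparseMPS (and SparseCsrMPS) keys being added to the dispatch lists in these hunks register the existing backend-generic sparse kernels, plus new MPS-specific ones such as add_out_sparse_mps, for sparse tensors that live on the MPS device. A minimal usage sketch of the intended effect, assuming a build where sparse COO tensors can be constructed on the "mps" device (shapes and values below are illustrative only):

import torch

# Hypothetical example: a 2x3 sparse COO tensor on the MPS device.
i = torch.tensor([[0, 1], [2, 0]], device="mps")  # indices, shape (sparse_dim, nnz)
v = torch.tensor([1.5, -2.0], device="mps")       # values, shape (nnz,)
s = torch.sparse_coo_tensor(i, v, (2, 3))

# With the SparseMPS dispatch entries, these calls reach the sparse kernels
# named above (e.g. sin_sparse, neg_sparse, add_out_sparse_mps).
print(s.sin())
print(s.neg())
print((s + s).coalesce())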
@@ -5417,7 +5421,7 @@ dispatch: CPU, CUDA, MPS, MTIA: sin_out SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sin_sparse_csr_out - SparseCPU, SparseCUDA: sin_sparse_out + SparseCPU, SparseCUDA, SparseMPS: sin_sparse_out tags: pointwise - func: sinc(Tensor self) -> Tensor @@ -5442,7 +5446,7 @@ structured_delegate: sinh.out variants: function, method dispatch: - SparseCPU, SparseCUDA: sinh_sparse + SparseCPU, SparseCUDA, SparseMPS: sinh_sparse SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sinh_sparse_csr tags: [core, pointwise] @@ -5451,7 +5455,7 @@ structured_delegate: sinh.out variants: function, method dispatch: - SparseCPU, SparseCUDA: sinh_sparse_ + SparseCPU, SparseCUDA, SparseMPS: sinh_sparse_ SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sinh_sparse_csr_ tags: pointwise @@ -5461,7 +5465,7 @@ structured_inherits: TensorIteratorBase dispatch: CPU, CUDA, MPS: sinh_out - SparseCPU, SparseCUDA: sinh_sparse_out + SparseCPU, SparseCUDA, SparseMPS: sinh_sparse_out SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sinh_sparse_csr_out # Returns a copy of this `Variable` that is detached from its autograd graph. @@ -5509,6 +5513,13 @@ tags: core manual_cpp_binding: True +- func: sym_is_contiguous(Tensor self, MemoryFormat memory_format=contiguous_format) -> SymBool + variants: function + device_check: NoCheck + device_guard: False + tags: core + manual_cpp_binding: True + - func: sym_numel(Tensor self) -> SymInt variants: function device_check: NoCheck @@ -5904,7 +5915,7 @@ variants: function, method dispatch: NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_sqrt - SparseCPU, SparseCUDA: sqrt_sparse + SparseCPU, SparseCUDA, SparseMPS: sqrt_sparse SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sqrt_sparse_csr tags: [core, pointwise] @@ -5913,7 +5924,7 @@ structured_delegate: sqrt.out variants: function, method dispatch: - SparseCPU, SparseCUDA: sqrt_sparse_ + SparseCPU, SparseCUDA, SparseMPS: sqrt_sparse_ SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sqrt_sparse_csr_ tags: pointwise @@ -5923,7 +5934,7 @@ structured_inherits: TensorIteratorBase dispatch: CPU, CUDA, MPS, MTIA: sqrt_out - SparseCPU, SparseCUDA: sqrt_sparse_out + SparseCPU, SparseCUDA, SparseMPS: sqrt_sparse_out SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sqrt_sparse_csr_out tags: pointwise @@ -6061,7 +6072,7 @@ structured_delegate: tan.out variants: function, method dispatch: - SparseCPU, SparseCUDA: tan_sparse + SparseCPU, SparseCUDA, SparseMPS: tan_sparse SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: tan_sparse_csr tags: [core, pointwise] @@ -6070,7 +6081,7 @@ structured_delegate: tan.out variants: function, method dispatch: - SparseCPU, SparseCUDA: tan_sparse_ + SparseCPU, SparseCUDA, SparseMPS: tan_sparse_ SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: tan_sparse_csr_ tags: pointwise @@ -6080,7 +6091,7 @@ structured_inherits: TensorIteratorBase dispatch: CPU, CUDA, MPS: tan_out - SparseCPU, SparseCUDA: tan_sparse_out + SparseCPU, SparseCUDA, SparseMPS: tan_sparse_out SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: tan_sparse_csr_out tags: pointwise @@ -6091,7 +6102,7 @@ dispatch: QuantizedCPU: tanh_quantized_cpu MkldnnCPU: mkldnn_tanh - SparseCPU, SparseCUDA: tanh_sparse + SparseCPU, SparseCUDA, SparseMPS: tanh_sparse SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: tanh_sparse_csr NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_tanh tags: [core, pointwise] @@ -6102,7 +6113,7 @@ variants: function, method dispatch: MkldnnCPU: mkldnn_tanh_ - SparseCPU, SparseCUDA: tanh_sparse_ + SparseCPU, SparseCUDA, SparseMPS: tanh_sparse_ 
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: tanh_sparse_csr_ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_tanh_ tags: pointwise @@ -6113,7 +6124,7 @@ structured_inherits: TensorIteratorBase dispatch: CPU, CUDA, MPS, MTIA: tanh_out - SparseCPU, SparseCUDA: tanh_sparse_out + SparseCPU, SparseCUDA, SparseMPS: tanh_sparse_out SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: tanh_sparse_csr_out tags: pointwise @@ -6385,8 +6396,8 @@ device_check: NoCheck # TensorIterator variants: function, method dispatch: - SparseCPU, SparseCUDA: trunc_sparse - SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: trunc_sparse_csr + SparseCPU, SparseCUDA, SparseMPS: trunc_sparse + SparseCsrCPU, SparseCsrCUDA, SparseCsrMPS, SparseCsrMeta: trunc_sparse_csr tags: [core, pointwise] - func: trunc_(Tensor(a!) self) -> Tensor(a!) @@ -6394,8 +6405,8 @@ device_check: NoCheck # TensorIterator variants: function, method dispatch: - SparseCPU, SparseCUDA: trunc_sparse_ - SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: trunc_sparse_csr_ + SparseCPU, SparseCUDA, SparseMPS: trunc_sparse_ + SparseCsrCPU, SparseCsrCUDA, SparseCsrMPS, SparseCsrMeta: trunc_sparse_csr_ tags: pointwise - func: trunc.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) @@ -6404,8 +6415,8 @@ device_check: NoCheck # TensorIterator dispatch: CPU, CUDA, MPS: trunc_out - SparseCPU, SparseCUDA: trunc_sparse_out - SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: trunc_sparse_csr_out + SparseCPU, SparseCUDA, SparseMPS: trunc_sparse_out + SparseCsrCPU, SparseCsrCUDA, SparseCsrMPS, SparseCsrMeta: trunc_sparse_csr_out tags: pointwise # Alias for trunc @@ -6915,7 +6926,7 @@ variants: function, method dispatch: CompositeExplicitAutograd: clone - SparseCPU, SparseCUDA: clone_sparse + SparseCPU, SparseCUDA, SparseMPS: clone_sparse SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: clone_sparse_compressed MkldnnCPU: mkldnn_clone QuantizedCPU, QuantizedCUDA: quantized_clone @@ -6950,7 +6961,7 @@ CPU, CUDA: zero_ MPS: zero_mps_ Meta: zero_meta_ - SparseCPU, SparseCUDA, SparseMeta: zero_sparse_ + SparseCPU, SparseCUDA, SparseMPS, SparseMeta: zero_sparse_ SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: zero_sparse_csr_ MkldnnCPU: mkldnn_zero_ NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: zero_nested_ @@ -7156,6 +7167,7 @@ - func: _grouped_mm(Tensor self, Tensor mat2, Tensor? offs=None, Tensor? bias=None, ScalarType? out_dtype=None) -> Tensor variants: function dispatch: + CompositeExplicitAutograd: _grouped_mm CUDA: _grouped_mm_cuda # NOTE [ Sparse: autograd and API ] @@ -7367,8 +7379,8 @@ - func: _to_dense(Tensor self, ScalarType? dtype=None, bool? 
masked_grad=None) -> Tensor variants: method dispatch: - SparseCPU, SparseCUDA: sparse_to_dense - SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sparse_compressed_to_dense + SparseCPU, SparseCUDA, SparseMPS: sparse_to_dense + SparseCsrCPU, SparseCsrCUDA, SparseCsrMPS, SparseCsrMeta: sparse_compressed_to_dense MkldnnCPU: mkldnn_to_dense autogen: _to_dense.out @@ -7394,8 +7406,8 @@ - func: dense_dim(Tensor self) -> int variants: method dispatch: - SparseCPU, SparseCUDA, SparseMeta: dense_dim_sparse - SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: dense_dim_sparse_csr + SparseCPU, SparseCUDA, SparseMPS, SparseMeta: dense_dim_sparse + SparseCsrCPU, SparseCsrCUDA, SparseCsrMPS, SparseCsrMeta: dense_dim_sparse_csr CompositeExplicitAutograd: dense_dim_default device_check: NoCheck device_guard: False @@ -7528,7 +7540,7 @@ device_check: NoCheck # Allows copy into different device variants: function dispatch: - SparseCPU, SparseCUDA, SparseMeta: copy_sparse_ + SparseCPU, SparseCUDA, SparseMPS, SparseMeta: copy_sparse_ autogen: copy_sparse_to_sparse, copy_sparse_to_sparse.out # By adding the AutogradNestedTensor this makes this function CompositeImplicit-like for nested tensors @@ -9719,7 +9731,7 @@ structured_delegate: sign.out variants: function, method dispatch: - SparseCPU, SparseCUDA: sign_sparse + SparseCPU, SparseCUDA, SparseMPS: sign_sparse SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sign_sparse_csr tags: [core, pointwise] @@ -9728,7 +9740,7 @@ structured_delegate: sign.out variants: method dispatch: - SparseCPU, SparseCUDA: sign_sparse_ + SparseCPU, SparseCUDA, SparseMPS: sign_sparse_ SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sign_sparse_csr_ tags: pointwise @@ -9739,7 +9751,7 @@ dispatch: CPU, CUDA: sign_out MPS: sign_out_mps - SparseCPU, SparseCUDA: sign_sparse_out + SparseCPU, SparseCUDA, SparseMPS: sign_sparse_out SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sign_sparse_csr_out tags: pointwise @@ -9747,7 +9759,7 @@ variants: function, method structured_delegate: signbit.out dispatch: - SparseCPU, SparseCUDA: signbit_sparse + SparseCPU, SparseCUDA, SparseMPS: signbit_sparse SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: signbit_sparse_csr tags: pointwise @@ -9758,7 +9770,7 @@ CPU: signbit_out CUDA: signbit_out MPS: signbit_out_mps - SparseCPU, SparseCUDA: signbit_sparse_out + SparseCPU, SparseCUDA, SparseMPS: signbit_sparse_out SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: signbit_sparse_csr_out tags: pointwise @@ -9941,7 +9953,7 @@ structured: True structured_inherits: TensorIteratorBase dispatch: - CPU, CUDA: igamma_out + CPU, CUDA, MPS: igamma_out tags: pointwise - func: igamma(Tensor self, Tensor other) -> Tensor @@ -9958,7 +9970,7 @@ structured: True structured_inherits: TensorIteratorBase dispatch: - CPU, CUDA: igammac_out + CPU, CUDA, MPS: igammac_out tags: pointwise - func: igammac(Tensor self, Tensor other) -> Tensor @@ -13262,7 +13274,7 @@ dispatch: CompositeExplicitAutograd: isinf NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_isinf - SparseCPU, SparseCUDA: isinf_sparse + SparseCPU, SparseCUDA, SparseMPS: isinf_sparse SparseMeta: isinf_sparse_meta SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: isinf_sparse_csr autogen: isinf.out @@ -13278,7 +13290,7 @@ structured_delegate: isposinf.out dispatch: NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_isposinf - SparseCPU, SparseCUDA: isposinf_sparse + SparseCPU, SparseCUDA, SparseMPS: isposinf_sparse SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: isposinf_sparse_csr tags: pointwise @@ -13287,7 +13299,7 @@ 
structured_inherits: TensorIteratorBase dispatch: CPU, CUDA, MPS: isposinf_out - SparseCPU, SparseCUDA: isposinf_sparse_out + SparseCPU, SparseCUDA, SparseMPS: isposinf_sparse_out SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: isposinf_sparse_csr_out tags: pointwise @@ -13296,7 +13308,7 @@ structured_delegate: isneginf.out dispatch: NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_isneginf - SparseCPU, SparseCUDA: isneginf_sparse + SparseCPU, SparseCUDA, SparseMPS: isneginf_sparse SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: isneginf_sparse_csr tags: pointwise @@ -13305,7 +13317,7 @@ structured_inherits: TensorIteratorBase dispatch: CPU, CUDA, MPS: isneginf_out - SparseCPU, SparseCUDA: isneginf_sparse_out + SparseCPU, SparseCUDA, SparseMPS: isneginf_sparse_out SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: isneginf_sparse_csr_out tags: pointwise diff --git a/aten/src/ATen/native/quantized/QTensor.cpp b/aten/src/ATen/native/quantized/QTensor.cpp index 4ca777be9cd4..f804670c3153 100644 --- a/aten/src/ATen/native/quantized/QTensor.cpp +++ b/aten/src/ATen/native/quantized/QTensor.cpp @@ -335,6 +335,8 @@ std::tuple choose_qparams_optimized( const int64_t n_bins, const double ratio, int64_t bit_width) { + const float* input_row = input_tensor.const_data_ptr(); + TORCH_CHECK_VALUE(input_row != nullptr, "input tensor is empty and has no data"); if (numel < 0 || numel > input_tensor.numel()) { TORCH_CHECK(false, "numel is out of the bound of input tensor"); @@ -342,7 +344,7 @@ std::tuple choose_qparams_optimized( TORCH_CHECK(numel <= input_tensor.numel(), "numel ", numel, " greater than input_tensor.numel() ", input_tensor.numel()); - const float* input_row = input_tensor.const_data_ptr(); + float xmin = *std::min_element(input_row, input_row + numel); float xmax = *std::max_element(input_row, input_row + numel); float n_bins_float = static_cast(n_bins); diff --git a/aten/src/ATen/native/sparse/mps/FlattenIndices.mm b/aten/src/ATen/native/sparse/mps/FlattenIndices.mm new file mode 100644 index 000000000000..41efa545cd2a --- /dev/null +++ b/aten/src/ATen/native/sparse/mps/FlattenIndices.mm @@ -0,0 +1,73 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include +#include +#include +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#endif + +namespace at::native { +namespace { + +using namespace mps; +using namespace at::sparse; + +#ifndef PYTORCH_JIT_COMPILE_SHADERS +static auto& lib = mps::MetalShaderLibrary::getBundledLibrary(); +#else +#include +#endif + +Tensor flatten_indices_mps(const Tensor& indices, IntArrayRef size) { + TORCH_CHECK(indices.dim() == 2, "flatten_indices: indices must be 2D"); + TORCH_CHECK(static_cast(indices.size(0)) == size.size(), + "flatten_indices: indices.size(0) must equal size.size()"); + + const int64_t sparse_dim = indices.size(0); + const int64_t nnz = indices.size(1); + + if (nnz == 0) { + return at::empty({0}, indices.options().dtype(kLong)); + } + + // Row-major multipliers for flattening: mul[d] = prod_{j>d}(size[j]) + std::vector row_muls(sparse_dim); + row_muls[sparse_dim - 1] = 1; + for (int64_t i = sparse_dim - 2; i >= 0; --i) { + row_muls[i] = row_muls[i + 1] * size[i + 1]; + } + + auto flat_indices = at::empty({nnz}, indices.options().dtype(kLong)); + + auto stream = getCurrentMPSStream(); + dispatch_sync_with_rethrow(stream->queue(), ^() { + @autoreleasepool { + auto pipeline = lib.getPipelineStateForFunc("flatten_indices_kernel"); + auto encoder = stream->commandEncoder(); + 
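// Bind the compute pipeline, then hand the kernel the COO indices, the row
// multipliers, the output buffer, the sparse dimension count and the index
// strides, and launch one thread per nonzero (nnz).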
[encoder setComputePipelineState:pipeline]; + mtl_setArgs(encoder, + indices, + row_muls, + flat_indices, + static_cast(sparse_dim), + indices.strides() + ); + + mtl_dispatch1DJob(encoder, pipeline, nnz); + } + }); + return flat_indices; +} + +} // namespace +REGISTER_MPS_DISPATCH(flatten_indices_stub, &flatten_indices_mps) +} // namespace at::native \ No newline at end of file diff --git a/aten/src/ATen/native/sparse/mps/SparseMPSTensor.mm b/aten/src/ATen/native/sparse/mps/SparseMPSTensor.mm index 7ccdf4077542..3e0ac4e35da1 100644 --- a/aten/src/ATen/native/sparse/mps/SparseMPSTensor.mm +++ b/aten/src/ATen/native/sparse/mps/SparseMPSTensor.mm @@ -20,46 +20,9 @@ #ifndef PYTORCH_JIT_COMPILE_SHADERS static auto& lib = mps::MetalShaderLibrary::getBundledLibrary(); #else -#include +#include #endif - -static Tensor flatten_indices(const Tensor& indices, IntArrayRef size) { - - TORCH_CHECK(indices.dim() == 2, "flatten_indices: indices must be 2D"); - TORCH_CHECK(static_cast(indices.size(0)) == size.size(), - "flatten_indices: indices.size(0) must equal size.size()"); - - int64_t sparse_dim = indices.size(0); - int64_t nnz = indices.size(1); - - if (nnz == 0) { - return at::empty({0}, indices.options().dtype(kLong)); - } - - std::vector strides(sparse_dim); - strides[sparse_dim - 1] = 1; - for (int64_t i = sparse_dim - 2; i >= 0; i--) { - strides[i] = strides[i + 1] * size[i + 1]; - } - - Tensor flat_indices = at::empty({nnz}, indices.options().dtype(kLong)); - - auto stream = getCurrentMPSStream(); - dispatch_sync_with_rethrow(stream->queue(), ^() { - @autoreleasepool { - auto pipeline = lib.getPipelineStateForFunc("flatten_indices_kernel"); - auto encoder = stream->commandEncoder(); - [encoder setComputePipelineState:pipeline]; - - mtl_setArgs(encoder, indices, strides, flat_indices, sparse_dim, nnz); - mtl_dispatch1DJob(encoder, pipeline, nnz); - } - }); - - return flat_indices; -} - static Tensor compute_output_positions(const Tensor& is_unique) { int64_t nnz = is_unique.size(0); diff --git a/aten/src/ATen/native/sparse/mps/SparseMPSTensorMath.mm b/aten/src/ATen/native/sparse/mps/SparseMPSTensorMath.mm new file mode 100644 index 000000000000..07ee2e097b49 --- /dev/null +++ b/aten/src/ATen/native/sparse/mps/SparseMPSTensorMath.mm @@ -0,0 +1,183 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#include +#include +#include +#include +#include +#endif + +namespace at::native { + +using namespace at::sparse; + +Tensor& add_out_dense_sparse_mps(Tensor& out, const Tensor& dense, const SparseTensor& sparse, const Scalar& alpha); + +Tensor& add_out_dense_sparse_mps( + Tensor& out, + const Tensor& dense, + const SparseTensor& sparse, + const Scalar& alpha) { + TORCH_CHECK(dense.is_mps(), "add: expected 'self' to be an MPS tensor, got ", dense.device()); + TORCH_CHECK(sparse.is_mps(), "add: expected 'other' to be an MPS tensor, got ", sparse.device()); + TORCH_CHECK(out.is_mps(), "add: expected 'out' to be an MPS tensor, got ", out.device()); + TORCH_CHECK(dense.sizes().equals(sparse.sizes()), + "add: expected 'self' and 'other' to have same size, but self has size ", + dense.sizes(), " while other has size ", sparse.sizes(), + " (FYI: dense-sparse addition does not currently support broadcasting)"); + + const int64_t nnz = sparse._nnz(); + if (nnz == 0) { + out.resize_as_(dense); + out.copy_(dense); + return out; + } + + auto commonDtype = at::result_type(dense, sparse); + 
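// Accumulate in the promoted common dtype of the dense and sparse operands;
// the check below ensures that dtype can be cast back into `out`.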
TORCH_CHECK(canCast(commonDtype, out.scalar_type()), + "Can't convert result type ", commonDtype, " to output ", out.scalar_type()); + + Tensor r; + const bool need_separate_buffer = out.is_same(dense) || (out.scalar_type() != commonDtype); + if (need_separate_buffer) { + r = at::empty(dense.sizes(), out.options().dtype(commonDtype)); + } else { + r = out; + r.resize_as_(dense); + } + + Tensor dense_buffer = dense.to(commonDtype); + if (!r.is_same(dense_buffer)) { + r.copy_(dense_buffer); + } + + Tensor indices = sparse._indices(); + Tensor values = sparse._values().to(commonDtype); + if (values.numel() == 0) { + if (!out.is_same(r)) { + out.resize_as_(dense); + out.copy_(r); + } + return out; + } + + const int64_t nDim = r.dim(); + const int64_t nDimI = sparse.sparse_dim(); + TORCH_CHECK(nDimI >= 0 && nDimI <= nDim, + "Invalid sparse_dim=", nDimI, " for dense tensor of dim ", nDim); + + Tensor indices1D = at::sparse::flatten_indices(indices, sparse.sizes()).contiguous(); + + int64_t view_rows = 1; + int64_t view_cols = 1; + for (int64_t i = 0; i < nDimI; i++) { + view_rows *= r.size(i); + } + for (int64_t i = nDimI; i < nDim; i++) { + view_cols *= r.size(i); + } + + if (view_cols == 1) { + Tensor r_flat = r.reshape({view_rows}); + Tensor values_1d = values.reshape({nnz}); + r_flat.index_add_(0, indices1D, values_1d, alpha); + } else { + Tensor r_view = r.view({view_rows, view_cols}); + Tensor values_2d = values.reshape({nnz, view_cols}); + r_view.index_add_(0, indices1D, values_2d, alpha); + } + + if (!out.is_same(r)) { + out.resize_as_(dense); + out.copy_(r); + } + return out; +} + + +SparseTensor& add_out_sparse_mps(const SparseTensor& self, + const SparseTensor& other, + const Scalar& alpha, + SparseTensor& out) { + TORCH_CHECK(other.is_sparse(), "add(sparse, dense) is not supported. 
Use add(dense, sparse) instead."); + TORCH_CHECK(self.is_mps(), "add: expected 'self' to be MPS, but got ", self.device()); + TORCH_CHECK(other.is_mps(), "add: expected 'other' to be MPS, but got ", other.device()); + TORCH_CHECK(out.is_mps(), "add: expected 'out' to be MPS, but got ", out.device()); + if (!self.is_sparse()) { + return add_out_dense_sparse_mps(out, self, other, alpha); + } + auto commonDtype = at::result_type(self, other); + TORCH_CHECK(canCast(commonDtype, out.scalar_type()), + "Can't convert result type ", commonDtype, " to output ", out.scalar_type()); + + TORCH_CHECK(self.sizes().equals(other.sizes()), + "add: expected 'self' and 'other' to have same size, but ", self.sizes(), " != ", other.sizes()); + + if (other._nnz() == 0) { + out.resize_as_(self); + Tensor vals = self._values(); + if (vals.scalar_type() != out.scalar_type()) { + vals = vals.to(out.scalar_type()); + } + alias_into_sparse(out, self._indices(), vals); + out._coalesced_(self.is_coalesced()); + return out; + } + + if (self._nnz() == 0) { + out.resize_as_(other); + Tensor vals = other._values(); + if (!alpha.isIntegral(false) || alpha.to() != 1.0) { + vals = at::mul(vals, alpha); + } + if (vals.scalar_type() != out.scalar_type()) { + vals = vals.to(out.scalar_type()); + } + alias_into_sparse(out, other._indices(), vals); + out._coalesced_(other.is_coalesced()); + return out; + } + + TORCH_CHECK(is_same_density(self, other), + "add: expected 'self' and 'other' to have same density, but 'self' has ", + self.sparse_dim(), " sparse dimensions while 'other' has ", other.sparse_dim(), " sparse dimensions"); + + Tensor t_indices_ = self._indices(); + Tensor s_indices_ = other._indices(); + + Tensor t_values_ = self._values().to(commonDtype); + Tensor s_values_ = other._values().to(commonDtype); + if (!alpha.isIntegral(false) || alpha.to() != 1.0) { + s_values_ = at::mul(s_values_, alpha); + } + + Tensor r_indices_ = at::cat({t_indices_, s_indices_}, 1); + Tensor r_values_ = at::cat({t_values_, s_values_ }, 0); + + SparseTensor tmp = empty({0}, out.options().dtype(commonDtype)); + tmp.resize_as_(other); + alias_into_sparse(tmp, r_indices_, r_values_); + tmp = _coalesce_sparse_mps(tmp); + + out.resize_as_(other); + Tensor out_vals = tmp._values(); + if (out.scalar_type() != commonDtype) { + out_vals = out_vals.to(out.scalar_type()); + } + alias_into_sparse(out, tmp._indices(), out_vals); + out._coalesced_(tmp.is_coalesced()); + + return out; +} + +} // namespace at::native \ No newline at end of file diff --git a/aten/src/ATen/native/sparse/mps/kernels/Sparse.metal b/aten/src/ATen/native/sparse/mps/kernels/Coalesce.metal similarity index 89% rename from aten/src/ATen/native/sparse/mps/kernels/Sparse.metal rename to aten/src/ATen/native/sparse/mps/kernels/Coalesce.metal index 8b85950e393a..e32d1edf1c2f 100644 --- a/aten/src/ATen/native/sparse/mps/kernels/Sparse.metal +++ b/aten/src/ATen/native/sparse/mps/kernels/Coalesce.metal @@ -2,19 +2,6 @@ #include using namespace metal; -kernel void flatten_indices_kernel( - device const int64_t* indices [[buffer(0)]], - device const int64_t* strides [[buffer(1)]], - device int64_t* flat_indices [[buffer(2)]], - constant uint& sparse_dim [[buffer(3)]], - constant uint& nnz [[buffer(4)]], - uint gid [[thread_position_in_grid]]) { - int64_t flat_idx = 0; - for (uint d = 0; d < sparse_dim; d++) { - flat_idx += indices[d * nnz + gid] * strides[d]; - } - flat_indices[gid] = flat_idx; -} kernel void compute_output_positions_kernel( device const bool* is_unique [[buffer(0)]], @@ 
-125,4 +112,6 @@ INSTANTIATE_COALESCE_WITH_POSITIONS(long); INSTANTIATE_COALESCE_WITH_POSITIONS(char); INSTANTIATE_COALESCE_WITH_POSITIONS(uchar); INSTANTIATE_COALESCE_WITH_POSITIONS(short); -INSTANTIATE_COALESCE_WITH_POSITIONS(int); \ No newline at end of file +INSTANTIATE_COALESCE_WITH_POSITIONS(int); +INSTANTIATE_COALESCE_WITH_POSITIONS(float2); +INSTANTIATE_COALESCE_WITH_POSITIONS(half2); \ No newline at end of file diff --git a/aten/src/ATen/native/sparse/mps/kernels/FlattenIndices.metal b/aten/src/ATen/native/sparse/mps/kernels/FlattenIndices.metal new file mode 100644 index 000000000000..00156dddb06c --- /dev/null +++ b/aten/src/ATen/native/sparse/mps/kernels/FlattenIndices.metal @@ -0,0 +1,19 @@ +#include +using namespace metal; + + +kernel void flatten_indices_kernel( + device const long* indices [[ buffer(0) ]], + device const long* row_muls [[ buffer(1) ]], + device long* flat_indices [[ buffer(2) ]], + constant uint& sparse_dim [[ buffer(3) ]], + constant long2& idx_strides [[ buffer(4) ]], + uint gid [[ thread_position_in_grid ]]) { + long flat = 0; + for (uint d = 0; d < sparse_dim; ++d) { + long off = (long)d * idx_strides.x + (long)gid * idx_strides.y; + long v = indices[off]; + flat += v * row_muls[d]; + } + flat_indices[gid] = flat; +} \ No newline at end of file diff --git a/aten/src/ATen/native/transformers/cuda/attention.cu b/aten/src/ATen/native/transformers/cuda/attention.cu index 1a3e2825d4fa..b8b43e0086c1 100644 --- a/aten/src/ATen/native/transformers/cuda/attention.cu +++ b/aten/src/ATen/native/transformers/cuda/attention.cu @@ -1396,12 +1396,15 @@ std::tuple _efficient_ at::Tensor v_t = value.transpose(1, 2); at::Tensor output_t = res.transpose(1, 2); bool is_causal; - if (static_cast(sdp::CustomMaskType::CausalFromTopLeft) == custom_mask_type) { - is_causal = true; - } else if (static_cast(sdp::CustomMaskType::NoCustomMask) == custom_mask_type) { + if (static_cast(sdp::CustomMaskType::NoCustomMask) == custom_mask_type) { is_causal = false; } else { - TORCH_CHECK(false, "[_efficient_attention_forward] Unsupported mask type on ROCM, for now"); + is_causal = true; +#if AOTRITON_V3_API == 0 + if (static_cast(sdp::CustomMaskType::CausalFromTopLeft) != custom_mask_type) { + TORCH_CHECK(false, "[_efficient_attention_forward] Unsupported mask type on ROCM, for now"); + } +#endif } at::Tensor atomic_counter; @@ -1426,7 +1429,51 @@ std::tuple _efficient_ auto offset_output = mk_philoxtensor(use_philox_state ? offset_t.data_ptr() : nullptr); auto persistent_counter = mk_atomictensor(is_causal ? atomic_counter.data_ptr() : nullptr); hipError_t err; // TODO: Error handling - if (seqstart_q.has_value()) { + if constexpr (AOTRITON_ALWAYS_V3_API) { // Better readability than nesting ifdef +#if AOTRITON_V3_API // if constexpr does not stop errors from undefined functions + using aotriton::v3::flash::CausalType; + using aotriton::v3::flash::VarlenType; + using aotriton::v3::flash::WindowValue; + aotriton::v3::flash::attn_fwd_params params; + params.Q = mk_aotensor(q_t, "q"); + params.K = mk_aotensor(k_t, "k"); + params.V = mk_aotensor(v_t, "v"); + params.Sm_scale = softmax_scale; + params.L = compute_logsumexp ? 
mk_aotensor<2>(softmax_lse, "M") : empty_t2; + params.Out = mk_aotensor(output_t, "Out"); + params.Max_seqlen_q = max_seqlen_q; // Unused if cu_seqlens_q is empty + params.Max_seqlen_k = max_seqlen_k; // Unused if cu_seqlens_k is empty + params.dropout_p = dropout_p; + params.philox_seed_ptr = seed; + params.philox_offset1 = offset1; + params.philox_offset2 = offset2; + params.philox_seed_output = seed_output; + params.philox_offset_output = offset_output; + params.encoded_softmax = mk_aotensor(softmax_fa_t, "encoded_softmax"); + params.persistent_atomic_counter = persistent_counter; + params.causal_type = is_causal ? CausalType::WindowedAttention : CausalType::None; + if (static_cast(sdp::CustomMaskType::CausalFromTopLeft) == custom_mask_type) { + params.window_left = WindowValue::TopLeftAligned; + params.window_right = WindowValue::TopLeftAligned; + } else if (static_cast(sdp::CustomMaskType::CausalFromBottomRight) == custom_mask_type) { + params.window_left = WindowValue::BottomRightAligned; + params.window_right = WindowValue::BottomRightAligned; + } + if (bias.has_value()) { + params.B = mk_aotensor(bias.value(), "bias"); + } + if (seqstart_q.has_value()) { + params.varlen_type = VarlenType::CompactVarlen; + params.cu_seqlens_q = mk_aotensor<1>(seqstart_q.value(), "cu_seqlens_q"); + params.cu_seqlens_k = mk_aotensor<1>(seqstart_k.value(), "cu_seqlens_k"); + } else { + params.varlen_type = VarlenType::None; + } + err = aotriton::v3::flash::attn_fwd(params, + aotriton::v3::flash::attn_fwd_params::kVersion, + stream); +#endif // AOTRITON_V3_API + } else if (seqstart_q.has_value()) { // varlen aka nested tensor err = attn_fwd_compact_varlen(mk_aotensor(q_t, "q"), mk_aotensor(k_t, "k"), diff --git a/aten/src/ATen/native/transformers/cuda/attention_backward.cu b/aten/src/ATen/native/transformers/cuda/attention_backward.cu index 6940bbbcb812..55fc1e261219 100644 --- a/aten/src/ATen/native/transformers/cuda/attention_backward.cu +++ b/aten/src/ATen/native/transformers/cuda/attention_backward.cu @@ -24,6 +24,7 @@ #include #include #else +#include #include #include #include @@ -47,6 +48,7 @@ #include #include #else +#include // MemoryEfficient Attention Specific Imports for ROCM #ifndef DISABLE_AOTRITON #include @@ -544,12 +546,15 @@ _efficient_attention_backward( } const auto softmax_scale = sdp::calculate_scale(query, scale).expect_float(); bool is_causal; - if (static_cast(sdp::CustomMaskType::CausalFromTopLeft) == custom_mask_type) { - is_causal = true; - } else if (static_cast(sdp::CustomMaskType::NoCustomMask) == custom_mask_type) { + if (static_cast(sdp::CustomMaskType::NoCustomMask) == custom_mask_type) { is_causal = false; } else { - TORCH_CHECK(false, "[_efficient_attention_backward] Unsupported mask type in AOTriton, for now"); + is_causal = true; +#if AOTRITON_V3_API == 0 + if (static_cast(sdp::CustomMaskType::CausalFromTopLeft) != custom_mask_type) { + TORCH_CHECK(false, "[_efficient_attention_forward] Unsupported mask type on ROCM, for now"); + } +#endif } at::Tensor q_t = query.permute({0,2,1,3}); at::Tensor k_t = key.permute({0,2,1,3}); @@ -568,7 +573,62 @@ _efficient_attention_backward( using sdp::aotriton_adapter::mk_aoscalartensor; using sdp::aotriton_adapter::cast_dtype; aotriton::TensorView<4> empty_t4(0, {0, 0, 0, 0}, {0, 0, 0, 0}, cast_dtype(query.dtype())); - if (cu_seqlens_q.has_value()) { + if constexpr (AOTRITON_ALWAYS_V3_API) { // Better readability than nesting ifdef +#if AOTRITON_V3_API // if constexpr does not stop errors from undefined functions + using 
aotriton::v3::flash::CausalType; + using aotriton::v3::flash::VarlenType; + using aotriton::v3::flash::WindowValue; + aotriton::v3::flash::attn_bwd_params params; + params.Q = mk_aotensor(q_t, "q"); + params.K = mk_aotensor(k_t, "k"); + params.V = mk_aotensor(v_t, "v"); + params.B = bias.has_value() ? mk_aotensor(bias.value(), "bias") : empty_t4; + params.Sm_scale = softmax_scale; + params.Out = mk_aotensor(out_t, "out"); + params.DO = mk_aotensor(dout_t, "dout"); + params.DK = mk_aotensor(dk_t, "dk"); + params.DV = mk_aotensor(dv_t, "dv"); + params.DQ = mk_aotensor(dq_t, "dq"); + params.DB = bias_requires_grad ? mk_aotensor(grad_bias, "db") : empty_t4; + params.L = mk_aotensor<2>(softmax_lse, "L"); + params.Max_seqlen_q = max_seqlen_q; // Unused if cu_seqlens_q is empty + params.Max_seqlen_k = max_seqlen_k; // Unused if cu_seqlens_k is empty + params.dropout_p = float(dropout_p); + params.philox_seed_ptr = mk_aoscalartensor(philox_seed); + params.philox_offset1 = mk_aoscalartensor(philox_offset); + params.philox_offset2 = 0; + params.causal_type = is_causal ? CausalType::WindowedAttention : CausalType::None; + if (static_cast(sdp::CustomMaskType::CausalFromTopLeft) == custom_mask_type) { + params.window_left = WindowValue::TopLeftAligned; + params.window_right = WindowValue::TopLeftAligned; + } else if (static_cast(sdp::CustomMaskType::CausalFromBottomRight) == custom_mask_type) { + params.window_left = WindowValue::BottomRightAligned; + params.window_right = WindowValue::BottomRightAligned; + } +#if AOTRITON_ALWAYS_V3_API + using sdp::aotriton_adapter::mklazy_empty_like; + using sdp::aotriton_adapter::mklazy_fp32zeros; + using sdp::aotriton_adapter::LazyTensorContext; + LazyTensorContext lazy_delta { .like_tensor = softmax_lse, .tensor_name = "delta" }; + LazyTensorContext lazy_dq_acc { .like_tensor = dq_t, .tensor_name = "dq_acc" }; + params.D = mklazy_empty_like<2>(&lazy_delta); + params.DQ_ACC = mklazy_fp32zeros<4>(&lazy_dq_acc); +#else + at::Tensor delta = at::empty_like(softmax_lse).contiguous(); + params.D = mk_aotensor<2>(delta, "delta"); +#endif + if (cu_seqlens_q.has_value()) { + params.varlen_type = VarlenType::CompactVarlen; + params.cu_seqlens_q = mk_aotensor<1>(cu_seqlens_q.value(), "cu_seqlens_q"); + params.cu_seqlens_k = mk_aotensor<1>(cu_seqlens_k.value(), "cu_seqlens_k"); + } else { + params.varlen_type = VarlenType::None; + } + err = aotriton::v3::flash::attn_bwd(params, + aotriton::v3::flash::attn_bwd_params::kVersion, + stream); +#endif // AOTRITON_V3_API + } else if (cu_seqlens_q.has_value()) { at::Tensor delta = at::empty_like(softmax_lse).contiguous(); // varlen aka Nested tensor err = attn_bwd_compact_varlen(mk_aotensor(q_t, "q"), diff --git a/aten/src/ATen/native/transformers/cuda/sdp_utils.cpp b/aten/src/ATen/native/transformers/cuda/sdp_utils.cpp index 00a43920b096..660aee3647ce 100644 --- a/aten/src/ATen/native/transformers/cuda/sdp_utils.cpp +++ b/aten/src/ATen/native/transformers/cuda/sdp_utils.cpp @@ -16,6 +16,7 @@ #include #include #include +#include #if AT_CUDNN_ENABLED() #include @@ -25,9 +26,12 @@ #if USE_ROCM #if defined(USE_FLASH_ATTENTION) || defined(USE_MEM_EFF_ATTENTION) +#include #include #define USE_ROCM_ATTENTION 1 #endif +#else +#define USE_ROCM_ATTENTION 0 #endif // Avoid potential compiler -Wall -Werror complains undefined macro @@ -72,13 +76,14 @@ bool priority_order_init_ = false; // TODO(eqy): more benchmarking to determine whether this should include sm86/89 // Needs to be kept in-sync with test_fused_chocie in test_transformers.py bool 
check_prefer_cudnn_attention() { - static const bool prefer_cudnn = c10::utils::check_env("TORCH_CUDNN_SDPA_PREFERRED") == true; + static const bool prefer_cudnn = c10::utils::check_env("TORCH_CUDNN_SDPA_PREFERRED") != false; if (!prefer_cudnn) { return false; } -#if (defined(CUDNN_VERSION) && (CUDNN_VERSION > 90000)) +#if (defined(CUDNN_VERSION) && (CUDNN_VERSION >= 90900)) auto dprops = at::cuda::getCurrentDeviceProperties(); - return dprops->major >= 9 && !dprops->minor; + auto major = dprops->major; + return (major == 9 || major == 10) && !dprops->minor; #else return false; #endif @@ -129,9 +134,24 @@ int64_t minimum_gemm_alignment(sdp_params const& params) { // caller_is_meff is added to make the TORCH_WARN message showing the correct result template bool check_head_dim_size_flash(sdp_params const& params, bool debug) { -#if USE_ROCM_ATTENTION && AOTRITON_VERSION_MINOR >= 9 +#if USE_ROCM_ATTENTION // AOTriton 0.9+ supports head_dim up to 512 - const auto max_size = c10::SymInt(512); + const static auto max_hdim = []() { +#if AOTRITON_VERSION_CURRENT == AOTRITON_VERSION_INT(0, 11) + // gfx11xx only support hdim <= 256 on AOTriton 0.11 + auto dprops = at::cuda::getCurrentDeviceProperties(); + const c10::basic_string_view arch(dprops->gcnArchName); + if (arch.starts_with("gfx11")) { + return 256; + } +#endif // AOTriton 0.11 +#if AOTRITON_VERSION_CURRENT >= AOTRITON_VERSION_INT(0, 9) + return 512; +#else + return 256; +#endif + }(); + const auto max_size = c10::SymInt(max_hdim); #else // All head_dim sizes must be equal and less than 256 const auto max_size = c10::SymInt(256); @@ -586,7 +606,7 @@ bool check_for_nested_inputs(sdp_params const& params, bool debug) { const auto dprop = at::cuda::getCurrentDeviceProperties(); // Check that the input is nested - if ((dprop->major == 9 || dprop->major == 10) && has_for_nested_inputs(params)) { + if (!(dprop->major == 9 || dprop->major == 10) && has_for_nested_inputs(params)) { if (debug) { TORCH_WARN("cuDNN SDPA supports nested tensors on SM 9.0, SM 10.0."); } @@ -646,6 +666,15 @@ bool can_use_cudnn_attention(const sdp_params& params, bool debug) { TORCH_WARN(CUDNN_VERSION, " cuDNN version too old to use cuDNN Attention (< v9.0.0)"); } return false; +#endif +#if defined(CUDNN_VERSION) + static auto cudnn_version = cudnnGetVersion(); + if (params.dropout > 0.0 && cudnn_version > 91100 && cudnn_version < 91400) { + if (debug) { + TORCH_WARN(CUDNN_VERSION, " cuDNN version does not support dropout in SDPA (9.11 - 9.13)."); + } + return false; + } #endif // Define gate functions that determine if a flash kernel can be ran // Replace with std::to_array when we migrate to c++20 diff --git a/aten/src/ATen/native/transformers/hip/aotriton_adapter.h b/aten/src/ATen/native/transformers/hip/aotriton_adapter.h index aedb205e5710..d316808cf9be 100644 --- a/aten/src/ATen/native/transformers/hip/aotriton_adapter.h +++ b/aten/src/ATen/native/transformers/hip/aotriton_adapter.h @@ -2,8 +2,12 @@ #ifdef USE_ROCM +// Expect to be included after headers of at::zeros_like and at::empty_like + #include #include +#include +#include //////////////////////////////////////////////////////////////////////////////// // Common macros copied from cuda/mem_eff_attention/gemm_kernel_utils.h @@ -111,6 +115,61 @@ inline aotriton::TensorView<0> mk_atomictensor(const int32_t* ptr) aotriton::DType::kInt32); } +#if AOTRITON_VERSION_CURRENT >= AOTRITON_VERSION_INT(0, 11) + +struct LazyTensorContext { + at::Tensor like_tensor; + std::string_view tensor_name; + at::Tensor tensor; +};
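// LazyTensorFunctions (below) materializes the tensor described by a
// LazyTensorContext the first time acquire() is invoked: fp32 zeros when
// kRequireZeros is set (used for the dq accumulator), otherwise an empty
// tensor shaped like like_tensor (used for delta).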
+ +template +struct LazyTensorFunctions : public LazyTensorContext { + static aotriton::TensorView acquire(void* cookie) { + auto ctx = (LazyTensorContext*)cookie; + if (!ctx->tensor.defined()) { + auto q = ctx->like_tensor; + if constexpr (kRequireZeros) { + ctx->tensor = at::zeros(q.sizes(), + q.options().dtype(at::kFloat)); + } else { + ctx->tensor = at::empty_like(q); + } + } + return mk_aotensor(ctx->tensor, ctx->tensor_name); + } + + static void dispose(void* cookie) { + } +}; + +template +aotriton::LazyTensor mklazy_common(LazyTensorContext* cookie) +{ + using LTF = LazyTensorFunctions; + return aotriton::LazyTensor { + .cookie = cookie, + .acquire = <F::acquire, + .dispose = <F::dispose + }; +} + +template +auto mklazy_empty_like(LazyTensorContext* cookie) +{ + return mklazy_common(cookie); +} + + +// Note: this will not keep the original strides +template +auto mklazy_fp32zeros(LazyTensorContext* cookie) +{ + return mklazy_common(cookie); +} + +#endif // >= 0.11 + } // namespace aotriton_adapter } // namespace sdp diff --git a/aten/src/ATen/native/transformers/hip/aotriton_versions.h b/aten/src/ATen/native/transformers/hip/aotriton_versions.h new file mode 100644 index 000000000000..2f5d3f0e1222 --- /dev/null +++ b/aten/src/ATen/native/transformers/hip/aotriton_versions.h @@ -0,0 +1,20 @@ +#pragma once + +#ifdef USE_ROCM + +#define AOTRITON_VERSION_INT(x, y) (x * 100 + y) +#define AOTRITON_VERSION_CURRENT (AOTRITON_VERSION_MAJOR * 100 + AOTRITON_VERSION_MINOR) + +#if AOTRITON_VERSION_CURRENT >= AOTRITON_VERSION_INT(0, 11) +#define AOTRITON_ALWAYS_V3_API 1 +#else +#define AOTRITON_ALWAYS_V3_API 0 +#endif + +#if AOTRITON_VERSION_CURRENT >= AOTRITON_VERSION_INT(0, 10) +#define AOTRITON_V3_API 1 +#else +#define AOTRITON_V3_API 0 +#endif + +#endif diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/aot/mha_all_aot.hip b/aten/src/ATen/native/transformers/hip/flash_attn/aot/mha_all_aot.hip index 1d4926c02274..b5b1ed429289 100644 --- a/aten/src/ATen/native/transformers/hip/flash_attn/aot/mha_all_aot.hip +++ b/aten/src/ATen/native/transformers/hip/flash_attn/aot/mha_all_aot.hip @@ -60,20 +60,13 @@ #include // AOTriton headers -#include #include #include -#if AOTRITON_VERSION_MINOR < 9 +#if AOTRITON_VERSION_CURRENT < AOTRITON_VERSION_INT(0, 9) #error "This adaptor code is only tested with AOTriton >= 0.9" #endif -#if (AOTRITON_VERSION_MAJOR * 100 + AOTRITON_VERSION_MINOR) >= 10 -#define V3_API 1 -#else -#define V3_API 0 -#endif - namespace pytorch_flash { namespace { @@ -93,15 +86,15 @@ calculate_swa(std::optional window_size_left, int max_seqlen_q, int max_seqlen_k, bool is_causal) { -#if V3_API // SWA is exposed through V3 API +#if AOTRITON_V3_API // SWA is exposed through V3 API bool needs_swa = false; using aotriton::v3::flash::WindowValue; // Default values when std::optional window_size_left/right have no value int window_left = max_seqlen_q; int window_right = max_seqlen_k; if (is_causal) { - window_left = WindowValue::TopLeftAligned; - window_right = WindowValue::TopLeftAligned; + window_left = WindowValue::BottomRightAligned; + window_right = WindowValue::BottomRightAligned; } if (window_size_left.has_value() || window_size_right.has_value()) { needs_swa = true; @@ -248,10 +241,10 @@ mha_fwd_aot(const at::Tensor &q, // batch_size x seqlen_q x num_heads x seqlen_q, seqlen_k, is_causal); -#if V3_API +#if AOTRITON_V3_API const bool uses_swa = needs_swa; #else - // When V3_API = 0, uses_swa is constexpr and the if (uses_swa) branch can be + // When AOTRITON_V3_API = 0, 
uses_swa is constexpr and the if (uses_swa) branch can be // optimized out (hopefully). constexpr bool uses_swa = false; #endif @@ -278,8 +271,8 @@ mha_fwd_aot(const at::Tensor &q, // batch_size x seqlen_q x num_heads x auto seed_output = mk_philoxtensor(use_philox_state ? seed_t.data_ptr() : nullptr); auto offset_output = mk_philoxtensor(use_philox_state ? offset_t.data_ptr() : nullptr); auto persistent_counter = mk_atomictensor(is_causal ? atomic_counter.data_ptr() : nullptr); - if (uses_swa) { -#if V3_API + if (uses_swa || AOTRITON_ALWAYS_V3_API) { +#if AOTRITON_V3_API using aotriton::v3::flash::CausalType; using aotriton::v3::flash::VarlenType; aotriton::v3::flash::attn_fwd_params params; @@ -299,7 +292,7 @@ mha_fwd_aot(const at::Tensor &q, // batch_size x seqlen_q x num_heads x params.philox_offset_output = offset_output; params.encoded_softmax = mk_aotensor(softmax_fa_t, "encoded_softmax"); params.persistent_atomic_counter = persistent_counter; - params.causal_type = CausalType::WindowedAttention; + params.causal_type = is_causal ? CausalType::WindowedAttention : CausalType::None; params.varlen_type = VarlenType::None; params.window_left = window_left; params.window_right = window_right; @@ -449,10 +442,10 @@ mha_varlen_fwd_aot(const at::Tensor &q, // total_q x num_heads x head_size, tot max_seqlen_q, max_seqlen_k, is_causal); -#if V3_API +#if AOTRITON_V3_API const bool uses_swa = needs_swa; #else - // When V3_API = 0, uses_swa is constexpr and the if (uses_swa) branch can be + // When AOTRITON_V3_API = 0, uses_swa is constexpr and the if (uses_swa) branch can be // optimized out (hopefully). constexpr bool uses_swa = false; #endif @@ -482,8 +475,8 @@ mha_varlen_fwd_aot(const at::Tensor &q, // total_q x num_heads x head_size, tot auto seed_output = use_philox_state ? mk_philoxtensor(seed_t.data_ptr()) : nullscalar; auto offset_output = use_philox_state ? mk_philoxtensor(offset_t.data_ptr()) : nullscalar; auto persistent_counter = is_causal ? mk_philoxtensor(atomic_counter.data_ptr()) : nullscalar; - if (uses_swa) { -#if V3_API + if (uses_swa || AOTRITON_ALWAYS_V3_API) { +#if AOTRITON_V3_API using aotriton::v3::flash::CausalType; using aotriton::v3::flash::VarlenType; aotriton::v3::flash::attn_fwd_params params; @@ -505,7 +498,7 @@ mha_varlen_fwd_aot(const at::Tensor &q, // total_q x num_heads x head_size, tot params.philox_offset_output = offset_output; params.encoded_softmax = mk_aotensor(softmax_fa_t, "encoded_softmax"); params.persistent_atomic_counter = persistent_counter; - params.causal_type = CausalType::WindowedAttention; + params.causal_type = is_causal ? 
CausalType::WindowedAttention : CausalType::None; params.varlen_type = VarlenType::CompactVarlen; params.window_left = window_left; params.window_right = window_right; @@ -599,10 +592,6 @@ mha_bwd_aot(const at::Tensor &dout, // batch_size x seqlen_q x num_heads, x hea const int seqlen_k = k.size(1); const int num_heads_k = k.size(2); - if (is_causal){ - TORCH_CHECK((seqlen_q == seqlen_k), "For backwards kernel seqlen_q must equal seqlen_k for causal kernels"); - } - TORCH_CHECK(batch_size > 0, "batch size must be positive"); TORCH_CHECK(head_size % 8 == 0, "head_size should be a multiple of 8"); TORCH_CHECK(head_size_og % 8 == 0, "head_size_og should be a multiple of 8, this is ensured by padding!"); @@ -654,10 +643,10 @@ mha_bwd_aot(const at::Tensor &dout, // batch_size x seqlen_q x num_heads, x hea seqlen_q, seqlen_k, is_causal); -#if V3_API +#if AOTRITON_V3_API const bool uses_swa = needs_swa; #else - // When V3_API = 0, uses_swa is constexpr and the if (uses_swa) branch can be + // When AOTRITON_V3_API = 0, uses_swa is constexpr and the if (uses_swa) branch can be // optimized out (hopefully). constexpr bool uses_swa = false; #endif @@ -681,10 +670,9 @@ mha_bwd_aot(const at::Tensor &dout, // batch_size x seqlen_q x num_heads, x hea hipError_t err; // TODO: Error handling using sdp::aotriton_adapter::mk_aotensor; using sdp::aotriton_adapter::mk_aoscalartensor; - if (uses_swa) { -#if V3_API + if (uses_swa || AOTRITON_ALWAYS_V3_API) { +#if AOTRITON_V3_API // Fused BWD does not support SWA - at::Tensor delta = at::empty_like(softmax_lse_cont).contiguous(); using aotriton::v3::flash::CausalType; using aotriton::v3::flash::VarlenType; aotriton::v3::flash::attn_bwd_params params; @@ -694,21 +682,32 @@ mha_bwd_aot(const at::Tensor &dout, // batch_size x seqlen_q x num_heads, x hea params.Sm_scale = softmax_scale; params.Out = mk_aotensor(out_t, "out"); params.DO = mk_aotensor(dout_t, "dout"); - params.DK = mk_aotensor(dq_t, "dq"); - params.DV = mk_aotensor(dk_t, "dk"); - params.DQ = mk_aotensor(dv_t, "dv"); + params.DQ = mk_aotensor(dq_t, "dq"); + params.DK = mk_aotensor(dk_t, "dk"); + params.DV = mk_aotensor(dv_t, "dv"); params.L = mk_aotensor<2>(softmax_lse_cont, "L"); - params.D = mk_aotensor<2>(delta, "delta"); params.Max_seqlen_q = seqlen_q; // Unused if cu_seqlens_q is empty params.Max_seqlen_k = seqlen_k; // Unused if cu_seqlens_k is empty params.dropout_p = p_dropout; params.philox_seed_ptr = mk_aoscalartensor(philox_seed); params.philox_offset1 = mk_aoscalartensor(philox_offset); params.philox_offset2 = 0; - params.causal_type = CausalType::WindowedAttention; - params.varlen_type = VarlenType::None; + params.causal_type = is_causal ? 
CausalType::WindowedAttention : CausalType::None; params.window_left = window_left; params.window_right = window_right; + params.varlen_type = VarlenType::None; +#if AOTRITON_ALWAYS_V3_API + using sdp::aotriton_adapter::mklazy_empty_like; + using sdp::aotriton_adapter::mklazy_fp32zeros; + using sdp::aotriton_adapter::LazyTensorContext; + LazyTensorContext lazy_delta { .like_tensor = softmax_lse_cont, .tensor_name = "delta" }; + LazyTensorContext lazy_dq_acc { .like_tensor = dq_t, .tensor_name = "dq_acc" }; + params.D = mklazy_empty_like<2>(&lazy_delta); + params.DQ_ACC = mklazy_fp32zeros<4>(&lazy_dq_acc); +#else + at::Tensor delta = at::empty_like(softmax_lse_cont).contiguous(); + params.D = mk_aotensor<2>(delta, "delta"); +#endif err = aotriton::v3::flash::attn_bwd(params, aotriton::v3::flash::attn_bwd_params::kVersion, stream); @@ -843,7 +842,6 @@ mha_varlen_bwd_aot(const at::Tensor &dout, // total_q x num_heads, x head_size CHECK_SHAPE(cu_seqlens_k, batch_size + 1); at::Tensor softmax_lse_cont = softmax_lse.view({batch_size * num_heads, max_seqlen_q}).contiguous(); - at::Tensor delta = at::empty_like(softmax_lse_cont).contiguous(); at::Tensor q_padded, k_padded, v_padded; q_padded = q.unsqueeze(0).transpose(1, 2); @@ -901,10 +899,10 @@ mha_varlen_bwd_aot(const at::Tensor &dout, // total_q x num_heads, x head_size max_seqlen_q, max_seqlen_k, is_causal); -#if V3_API +#if AOTRITON_V3_API const bool uses_swa = needs_swa; #else - // When V3_API = 0, uses_swa is constexpr and the if (uses_swa) branch can be + // When AOTRITON_V3_API = 0, uses_swa is constexpr and the if (uses_swa) branch can be // optimized out (hopefully). constexpr bool uses_swa = false; #endif @@ -924,8 +922,8 @@ mha_varlen_bwd_aot(const at::Tensor &dout, // total_q x num_heads, x head_size hipError_t err; // TODO: Error handling using sdp::aotriton_adapter::mk_aotensor; using sdp::aotriton_adapter::mk_aoscalartensor; - if (uses_swa) { -#if V3_API + if (uses_swa || AOTRITON_ALWAYS_V3_API) { +#if AOTRITON_V3_API using aotriton::v3::flash::CausalType; using aotriton::v3::flash::VarlenType; aotriton::v3::flash::attn_bwd_params params; @@ -935,11 +933,10 @@ mha_varlen_bwd_aot(const at::Tensor &dout, // total_q x num_heads, x head_size params.Sm_scale = softmax_scale; params.Out = mk_aotensor(out_t, "out"); params.DO = mk_aotensor(dout_t, "dout"); - params.DK = mk_aotensor(dq_padded, "dq"); - params.DV = mk_aotensor(dk_padded, "dk"); - params.DQ = mk_aotensor(dv_padded, "dv"); + params.DK = mk_aotensor(dk_padded, "dk"); + params.DV = mk_aotensor(dv_padded, "dv"); + params.DQ = mk_aotensor(dq_padded, "dq"); params.L = mk_aotensor<2>(softmax_lse_cont, "L"); - params.D = mk_aotensor<2>(delta, "delta"); params.cu_seqlens_q = mk_aotensor<1>(cu_seqlens_q, "cu_seqlens_q"); params.cu_seqlens_k = mk_aotensor<1>(cu_seqlens_k, "cu_seqlens_k"); params.Max_seqlen_q = max_seqlen_q; // Unused if cu_seqlens_q is empty @@ -948,17 +945,30 @@ mha_varlen_bwd_aot(const at::Tensor &dout, // total_q x num_heads, x head_size params.philox_seed_ptr = mk_aoscalartensor(philox_seed); params.philox_offset1 = mk_aoscalartensor(philox_offset); params.philox_offset2 = 0; - params.causal_type = CausalType::WindowedAttention; + params.causal_type = is_causal ? 
CausalType::WindowedAttention : CausalType::None; params.varlen_type = VarlenType::CompactVarlen; params.window_left = window_left; params.window_right = window_right; +#if AOTRITON_ALWAYS_V3_API + using sdp::aotriton_adapter::mklazy_empty_like; + using sdp::aotriton_adapter::mklazy_fp32zeros; + using sdp::aotriton_adapter::LazyTensorContext; + LazyTensorContext lazy_delta { .like_tensor = softmax_lse_cont, .tensor_name = "delta" }; + LazyTensorContext lazy_dq_acc { .like_tensor = dq_padded, .tensor_name = "dq_acc" }; + params.D = mklazy_empty_like<2>(&lazy_delta); + params.DQ_ACC = mklazy_fp32zeros<4>(&lazy_dq_acc); +#else + at::Tensor delta = at::empty_like(softmax_lse_cont).contiguous(); + params.D = mk_aotensor<2>(delta, "delta"); +#endif err = aotriton::v3::flash::attn_bwd(params, aotriton::v3::flash::attn_bwd_params::kVersion, stream); -#endif +#endif // AOTRITON_ALWAYS_V3_API } else { using aotriton::v2::flash::attn_bwd_compact_varlen; using sdp::aotriton_adapter::cast_dtype; + at::Tensor delta = at::empty_like(softmax_lse_cont).contiguous(); aotriton::TensorView<4> empty_bias(0, {0,0,0,0}, {0,0,0,0}, cast_dtype(q.dtype())); err = attn_bwd_compact_varlen(mk_aotensor(q_padded, "q"), mk_aotensor(k_padded, "k"), diff --git a/aten/src/ATen/native/transformers/hip/gemm_kernel_utils.h b/aten/src/ATen/native/transformers/hip/gemm_kernel_utils.h new file mode 100644 index 000000000000..c18744afc1ff --- /dev/null +++ b/aten/src/ATen/native/transformers/hip/gemm_kernel_utils.h @@ -0,0 +1,32 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +// This file is a trimmed version of cuda/mem_eff_attention/gemm_kernel_utils.h +#pragma once + +#define CHECK_NOSPARSE_CONTIGUOUS_CUDA(TENSOR) \ + TORCH_CHECK(TENSOR.is_cuda(), #TENSOR " must be a CUDA tensor"); \ + TORCH_CHECK(!TENSOR.is_sparse(), #TENSOR " must be a dense tensor"); \ + TORCH_CHECK(TENSOR.is_contiguous()); + +#define CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(TENSOR) \ + TORCH_CHECK(TENSOR.is_cuda(), #TENSOR " must be a CUDA tensor"); \ + TORCH_CHECK(!TENSOR.is_sparse(), #TENSOR " must be a dense tensor"); \ + TORCH_CHECK( \ + TENSOR.stride(-1) == 1, #TENSOR ": last dimension must be contiguous"); + +#define CHECK_ALIGNED_PTR(PTR, ALIGNMENT) \ + TORCH_CHECK( \ + uint64_t(PTR) % ALIGNMENT == 0, #PTR " is not correctly aligned") + +#define ASSIGN_CHECK_OVERFLOW(A, B) \ + { \ + A = B; \ + TORCH_CHECK( \ + B < std::numeric_limits::max(), #B " overflows"); \ + } diff --git a/aten/src/ATen/templates/FunctionalInverses.h b/aten/src/ATen/templates/FunctionalInverses.h index 3217e097d7ad..b15cd09a6c65 100644 --- a/aten/src/ATen/templates/FunctionalInverses.h +++ b/aten/src/ATen/templates/FunctionalInverses.h @@ -2,22 +2,12 @@ // ${generated_comment} +#include #include namespace at { namespace functionalization { -enum class InverseReturnMode { - /// Specifies that functional inverses should always return a view. - AlwaysView, - /// Specifies that functional inverses should always return a non-view / copy. - NeverView, - /// Specifies that functional inverses should return a view unless a (copying) scatter - /// inverse exists, in which case that will be used instead. - /// This avoids as_strided() calls that can be difficult for subclasses to handle. 
- ViewOrScatterInverse, -}; - struct FunctionalInverses { ${view_inverse_declarations} diff --git a/aten/src/ATen/templates/RegisterFunctionalization.cpp b/aten/src/ATen/templates/RegisterFunctionalization.cpp index dc8619c25fc5..408aff0cdab4 100644 --- a/aten/src/ATen/templates/RegisterFunctionalization.cpp +++ b/aten/src/ATen/templates/RegisterFunctionalization.cpp @@ -4,7 +4,7 @@ #include #include #include -#include +#include #include #include diff --git a/aten/src/ATen/templates/ViewMetaClasses.cpp b/aten/src/ATen/templates/ViewMetaClasses.cpp new file mode 100644 index 000000000000..0fd53171935f --- /dev/null +++ b/aten/src/ATen/templates/ViewMetaClasses.cpp @@ -0,0 +1,19 @@ +// ${generated_comment} + +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +${op_headers} +#endif + +namespace at { +namespace functionalization { + +${view_meta_implementations} + +} // namespace functionalization +} // namespace at diff --git a/aten/src/ATen/templates/ViewMetaClasses.h b/aten/src/ATen/templates/ViewMetaClasses.h new file mode 100644 index 000000000000..be2dee2a871b --- /dev/null +++ b/aten/src/ATen/templates/ViewMetaClasses.h @@ -0,0 +1,12 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +// ${generated_comment} + +#include + +namespace at { +namespace functionalization { + +${view_meta_declarations} + +} // namespace functionalization +} // namespace at diff --git a/aten/src/ATen/templates/ViewMetaClassesPythonBinding.cpp b/aten/src/ATen/templates/ViewMetaClassesPythonBinding.cpp new file mode 100644 index 000000000000..c784e5abe5c8 --- /dev/null +++ b/aten/src/ATen/templates/ViewMetaClassesPythonBinding.cpp @@ -0,0 +1,11 @@ +#include +#include + +namespace torch::functionalization { + +void initGenerated(PyObject* module) { + auto functionalization = py::handle(module).cast(); + $view_meta_bindings +} + +} // namespace torch::functionalization diff --git a/aten/src/ATen/test/cuda_allocator_test.cpp b/aten/src/ATen/test/cuda_allocator_test.cpp index 5aa2378c22c4..27a352e7d5a2 100644 --- a/aten/src/ATen/test/cuda_allocator_test.cpp +++ b/aten/src/ATen/test/cuda_allocator_test.cpp @@ -5,51 +5,6 @@ #include -#include - TEST(AllocatorTestCUDA, test_clone) { test_allocator_clone(c10::cuda::CUDACachingAllocator::get()); } - -static int called_dummy_free_0 = 0; -static int called_dummy_free_1 = 0; - -void* dummy_alloc_0(size_t size, int device, void* stream) {return nullptr;} -void dummy_free_0(void* data, size_t size, int device, void* stream) { - called_dummy_free_0++; -} -void dummy_free_1(void* data, size_t size, int device, void* stream) { - called_dummy_free_1++; -} - -// Tests that data_ptrs have their respective deleters -// when mixing allocators -TEST(AllocatorTestCUDA, test_pluggable_allocator_deleters) { - // Create a tensor with dummy_allocator_0, where dummy_free_0 is the deleter - auto dummy_allocator_0 = torch::cuda::CUDAPluggableAllocator::createCustomAllocator(dummy_alloc_0, dummy_free_0); - c10::cuda::CUDACachingAllocator::allocator.store(dummy_allocator_0.get()); - at::Tensor a = at::empty({0}, at::TensorOptions().device(at::kCUDA)); - - // Create a tensor with dummy_allocator_1, where dummy_free_1 is the deleter - auto dummy_allocator_1 = torch::cuda::CUDAPluggableAllocator::createCustomAllocator(dummy_alloc_0, dummy_free_1); - c10::cuda::CUDACachingAllocator::allocator.store(dummy_allocator_1.get()); - at::Tensor b = at::empty({0}, at::TensorOptions().device(at::kCUDA)); - - // Manually use a's deleter - auto* ctx = 
a.storage().data_ptr().get_context(); - a.storage().data_ptr().get_deleter()(ctx); - a.storage().mutable_data_ptr().release_context(); - - // a's deleter is dummy_free_0 - // dummy_free_0 should be called above, so called_dummy_free_0 should be 1 - ASSERT_TRUE(called_dummy_free_0 == 1); - - // Manually use b's deleter - ctx = b.storage().data_ptr().get_context(); - b.storage().data_ptr().get_deleter()(ctx); - b.storage().mutable_data_ptr().release_context(); - - // b's deleter is dummy_free_1 - // dummy_free_1 should be called above, so called_dummy_free_1 should be 1 - ASSERT_TRUE(called_dummy_free_1 == 1); -} diff --git a/aten/src/ATen/test/cuda_vectorized_test.cu b/aten/src/ATen/test/cuda_vectorized_test.cu index 7ba7bcb99bce..e4c18102526a 100644 --- a/aten/src/ATen/test/cuda_vectorized_test.cu +++ b/aten/src/ATen/test/cuda_vectorized_test.cu @@ -10,8 +10,13 @@ using namespace at::native::memory; constexpr int buffer_size = 1024; +#if defined(CUDA_VERSION) && CUDA_VERSION < 13000 __managed__ double4 buffer1[buffer_size]; __managed__ double4 buffer2[buffer_size]; +#else +__managed__ double4_16a buffer1[buffer_size]; +__managed__ double4_16a buffer2[buffer_size]; +#endif void reset_buffers() { for (int i = 0; i < buffer_size; i++) { diff --git a/benchmarks/dynamo/check_accuracy.py b/benchmarks/dynamo/check_accuracy.py index 5cd714fe02e9..678cee5f752c 100644 --- a/benchmarks/dynamo/check_accuracy.py +++ b/benchmarks/dynamo/check_accuracy.py @@ -15,6 +15,8 @@ "timm_efficientnet", # see https://github.com/pytorch/pytorch/issues/148699 "XGLMForCausalLM", # discovered in https://github.com/pytorch/pytorch/pull/128148 "moondream", # discovered in https://github.com/pytorch/pytorch/pull/159291 + # discovered in https://github.com/pytorch/pytorch/issues/161419. 
Its not flaky but really hard to repro, so skipping it + "mobilenetv3_large_100", } diff --git a/benchmarks/dynamo/ci_expected_accuracy/aot_eager_timm_training.csv b/benchmarks/dynamo/ci_expected_accuracy/aot_eager_timm_training.csv index 1dceba2f8ba9..1def1d99bd53 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/aot_eager_timm_training.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/aot_eager_timm_training.csv @@ -130,7 +130,7 @@ mnasnet_100,pass,7 -mobilenetv2_100,fail_accuracy,7 +mobilenetv2_100,pass,7 diff --git a/benchmarks/dynamo/ci_expected_accuracy/aot_eager_torchbench_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/aot_eager_torchbench_inference.csv index 01762c5f5f29..1d199fe8ea66 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/aot_eager_torchbench_inference.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/aot_eager_torchbench_inference.csv @@ -106,7 +106,7 @@ dlrm,pass,0 -doctr_det_predictor,pass,4 +doctr_det_predictor,pass,3 diff --git a/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_amp_freezing_torchbench_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_amp_freezing_torchbench_inference.csv index e68aa2fa5351..a4dbaeb7b546 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_amp_freezing_torchbench_inference.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_amp_freezing_torchbench_inference.csv @@ -98,11 +98,11 @@ dlrm,pass,0 -doctr_det_predictor,pass,5 +doctr_det_predictor,pass,3 -doctr_reco_predictor,pass,4 +doctr_reco_predictor,pass,1 diff --git a/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_freezing_torchbench_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_freezing_torchbench_inference.csv index aec659fdcd65..885029ba8c56 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_freezing_torchbench_inference.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_freezing_torchbench_inference.csv @@ -98,11 +98,11 @@ dlrm,pass,0 -doctr_det_predictor,pass,5 +doctr_det_predictor,pass,3 -doctr_reco_predictor,pass,4 +doctr_reco_predictor,pass,1 diff --git a/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_torchbench_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_torchbench_inference.csv index 4f2eec149352..aa7a3161afcc 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_torchbench_inference.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_torchbench_inference.csv @@ -98,11 +98,11 @@ dlrm,pass,0 -doctr_det_predictor,pass,5 +doctr_det_predictor,pass,3 -doctr_reco_predictor,pass,4 +doctr_reco_predictor,pass,1 diff --git a/benchmarks/dynamo/ci_expected_accuracy/dynamic_aot_eager_torchbench_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/dynamic_aot_eager_torchbench_inference.csv index 3e4c3caa1ca9..20cad351b127 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/dynamic_aot_eager_torchbench_inference.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/dynamic_aot_eager_torchbench_inference.csv @@ -106,7 +106,7 @@ dlrm,pass,0 -doctr_det_predictor,pass,4 +doctr_det_predictor,pass,3 diff --git a/benchmarks/dynamo/ci_expected_accuracy/dynamic_aot_eager_torchbench_training.csv b/benchmarks/dynamo/ci_expected_accuracy/dynamic_aot_eager_torchbench_training.csv index 3630f9a75af8..5050b3762ed9 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/dynamic_aot_eager_torchbench_training.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/dynamic_aot_eager_torchbench_training.csv @@ -2,7 +2,7 @@ name,accuracy,graph_breaks 
-torchrec_dlrm,fail_to_run,3 +torchrec_dlrm,pass,6 @@ -94,7 +94,7 @@ hf_Bert_large,pass,6 -hf_BigBird,fail_to_run,3 +hf_BigBird,pass,6 @@ -110,7 +110,7 @@ hf_GPT2_large,pass_due_to_skip,0 -hf_Reformer,fail_to_run,21 +hf_Reformer,pass,25 diff --git a/benchmarks/dynamo/ci_expected_accuracy/dynamic_cpu_inductor_torchbench_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/dynamic_cpu_inductor_torchbench_inference.csv index c8db4d582320..f26dea6f692e 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/dynamic_cpu_inductor_torchbench_inference.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/dynamic_cpu_inductor_torchbench_inference.csv @@ -82,11 +82,11 @@ dlrm,pass,0 -doctr_det_predictor,pass,5 +doctr_det_predictor,pass,3 -doctr_reco_predictor,pass,4 +doctr_reco_predictor,pass,1 diff --git a/benchmarks/dynamo/ci_expected_accuracy/dynamic_cpu_max_autotune_inductor_amp_freezing_torchbench_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/dynamic_cpu_max_autotune_inductor_amp_freezing_torchbench_inference.csv index f4c9ffddd997..39149853947c 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/dynamic_cpu_max_autotune_inductor_amp_freezing_torchbench_inference.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/dynamic_cpu_max_autotune_inductor_amp_freezing_torchbench_inference.csv @@ -98,11 +98,11 @@ dlrm,pass,0 -doctr_det_predictor,pass,5 +doctr_det_predictor,pass,3 -doctr_reco_predictor,pass,4 +doctr_reco_predictor,pass,1 diff --git a/benchmarks/dynamo/ci_expected_accuracy/dynamic_inductor_torchbench_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/dynamic_inductor_torchbench_inference.csv index 63d0efa38f63..2b2c1a504647 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/dynamic_inductor_torchbench_inference.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/dynamic_inductor_torchbench_inference.csv @@ -106,7 +106,7 @@ dlrm,pass,0 -doctr_det_predictor,pass,4 +doctr_det_predictor,pass,3 diff --git a/benchmarks/dynamo/ci_expected_accuracy/dynamo_eager_torchbench_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/dynamo_eager_torchbench_inference.csv index 01762c5f5f29..1d199fe8ea66 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/dynamo_eager_torchbench_inference.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/dynamo_eager_torchbench_inference.csv @@ -106,7 +106,7 @@ dlrm,pass,0 -doctr_det_predictor,pass,4 +doctr_det_predictor,pass,3 diff --git a/benchmarks/dynamo/ci_expected_accuracy/inductor_torchbench_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/inductor_torchbench_inference.csv index fbd169539ab7..e41018657c0e 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/inductor_torchbench_inference.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/inductor_torchbench_inference.csv @@ -106,7 +106,7 @@ dlrm,pass,0 -doctr_det_predictor,pass,4 +doctr_det_predictor,pass,3 diff --git a/benchmarks/dynamo/ci_expected_accuracy/rocm/aot_eager_torchbench_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/rocm/aot_eager_torchbench_inference.csv index 6f316b219bb9..bf70642a855e 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/rocm/aot_eager_torchbench_inference.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/rocm/aot_eager_torchbench_inference.csv @@ -106,11 +106,11 @@ dlrm,pass,0 -doctr_det_predictor,eager_fail_to_run,5 +doctr_det_predictor,eager_fail_to_run,3 -doctr_reco_predictor,eager_fail_to_run,4 +doctr_reco_predictor,eager_fail_to_run,1 diff --git a/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_aot_eager_torchbench_inference.csv 
b/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_aot_eager_torchbench_inference.csv index 4b5138ce9c36..e019365ccbfd 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_aot_eager_torchbench_inference.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_aot_eager_torchbench_inference.csv @@ -106,11 +106,11 @@ dlrm,pass,0 -doctr_det_predictor,eager_fail_to_run,5 +doctr_det_predictor,eager_fail_to_run,3 -doctr_reco_predictor,eager_fail_to_run,4 +doctr_reco_predictor,eager_fail_to_run,1 diff --git a/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_inductor_torchbench_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_inductor_torchbench_inference.csv index a3fc7cf19237..fed8ebded682 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_inductor_torchbench_inference.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_inductor_torchbench_inference.csv @@ -106,11 +106,11 @@ dlrm,pass,0 -doctr_det_predictor,eager_fail_to_run,5 +doctr_det_predictor,eager_fail_to_run,3 -doctr_reco_predictor,eager_fail_to_run,4 +doctr_reco_predictor,eager_fail_to_run,1 diff --git a/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamo_eager_torchbench_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamo_eager_torchbench_inference.csv index 6f316b219bb9..bf70642a855e 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamo_eager_torchbench_inference.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamo_eager_torchbench_inference.csv @@ -106,11 +106,11 @@ dlrm,pass,0 -doctr_det_predictor,eager_fail_to_run,5 +doctr_det_predictor,eager_fail_to_run,3 -doctr_reco_predictor,eager_fail_to_run,4 +doctr_reco_predictor,eager_fail_to_run,1 diff --git a/benchmarks/dynamo/ci_expected_accuracy/rocm/inductor_torchbench_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/rocm/inductor_torchbench_inference.csv index 8ccf95da9659..014e23e41cb3 100644 --- a/benchmarks/dynamo/ci_expected_accuracy/rocm/inductor_torchbench_inference.csv +++ b/benchmarks/dynamo/ci_expected_accuracy/rocm/inductor_torchbench_inference.csv @@ -106,11 +106,11 @@ dlrm,pass,0 -doctr_det_predictor,eager_fail_to_run,5 +doctr_det_predictor,eager_fail_to_run,3 -doctr_reco_predictor,eager_fail_to_run,4 +doctr_reco_predictor,eager_fail_to_run,1 diff --git a/benchmarks/dynamo/torchbench.yaml b/benchmarks/dynamo/torchbench.yaml index bf0a1b6c31e8..6a15cf33222b 100644 --- a/benchmarks/dynamo/torchbench.yaml +++ b/benchmarks/dynamo/torchbench.yaml @@ -219,7 +219,9 @@ skip: - timm_regnet - timm_nfnet - cuda: [] + cuda: + # Temporary until https://github.com/pytorch/pytorch/issues/162282 is fixed + - sam_fast test: training: diff --git a/benchmarks/operator_benchmark/benchmark_core.py b/benchmarks/operator_benchmark/benchmark_core.py index cb836bb5eaa4..3f79ed2318c4 100644 --- a/benchmarks/operator_benchmark/benchmark_core.py +++ b/benchmarks/operator_benchmark/benchmark_core.py @@ -4,6 +4,7 @@ import functools import json import os +import platform import timeit from collections import namedtuple from dataclasses import asdict, dataclass @@ -17,6 +18,7 @@ # needs to be imported after torch import torch.utils.cpp_extension as cpp_extension # noqa: F401 +from torch.utils.benchmark import Timer """Performance microbenchmarks. 
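As a point of reference for the newly imported torch.utils.benchmark.Timer: a minimal, self-contained sketch of the adaptive_autorange call pattern the harness adopts below (run_op is a hypothetical stand-in for the harness's forward callable; the real code wires in test_case.run_forward and scales the median by the iteration count):

import torch
from torch.utils.benchmark import Timer

def run_op(iters: int) -> None:
    # Hypothetical workload standing in for test_case.run_forward.
    x = torch.rand(64, 64)
    for _ in range(iters):
        x = x + x

timer = Timer(
    stmt="run_op(iters)",
    globals={"run_op": run_op, "iters": 10},
)
# adaptive_autorange reruns the statement until the median is statistically
# stable and returns a Measurement; .median is seconds per statement run.
measurement = timer.adaptive_autorange(min_run_time=0.0001)
print(f"median seconds per run_op(10) call: {measurement.median:.6f}")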
@@ -191,6 +193,11 @@ def __init__(self, args): self.predefined_minimum_secs = 1 self.max_iters = 1e6 self.use_jit = args.use_jit + self.use_compile = args.use_compile + if self.use_jit and self.use_compile: + raise ValueError( + "use_jit and use_compile are mutually exclusive, please specify one." + ) self.num_runs = args.num_runs self.print_per_iter = False self.output_csv = args.output_csv @@ -222,7 +229,7 @@ def _print_header(self): if self.args.operators: print(f"# {self.args.operators}") - def _print_perf_result(self, reported_run_time_us, test_case): + def _print_perf_result(self, results, test_case): if self.args.report_aibench: # Output for AIBench # Print out per iteration execution time instead of avg time @@ -236,12 +243,14 @@ def _print_perf_result(self, reported_run_time_us, test_case): "type": test_name, "metric": "latency", "unit": "us", - "value": str(reported_run_time_us[run]), + "value": str(results["reported_run_time_us"][run]), } ) ) else: - print(f"# Mode: {'JIT' if self.use_jit else 'Eager'}") + print( + f"# Mode: {'JIT' if self.use_jit else 'Compile' if self.use_compile else 'Eager'}" + ) print( f"# Name: {test_case.test_config.test_name}\n# Input: {test_case.test_config.input_config}" ) @@ -250,25 +259,33 @@ def _print_perf_result(self, reported_run_time_us, test_case): if self.num_runs > 1: for run in range(self.num_runs): print( - f"Run: {run}, {mode} Execution Time (us) : {reported_run_time_us[run]:.3f}" + f"Run: {run}, {mode} Execution Time (us) : {results['reported_run_time_us'][run]:.3f}" ) print() else: - print(f"{mode} Execution Time (us) : {reported_run_time_us[0]:.3f}\n") + print( + f"{mode} Execution Time (us) : {results['reported_run_time_us'][0]:.3f}" + ) + print(f"Peak Memory (KB) : {results['peak_memory']}\n") - def _perf_result_to_dict(self, reported_run_time_us, test_case): + def _perf_result_to_dict(self, results, test_case): """This function is the parallel of _print_perf_result, which instead of writing information to terminal, returns a dictionary.
""" if self.args.report_aibench: return {} + out = { "test_name": test_case.test_config.test_name, "input_config": test_case.test_config.input_config, - "mode": "JIT" if self.use_jit else "Eager", + "runtime": ( + "JIT" if self.use_jit else "Compile" if self.use_compile else "Eager" + ), "run": "Backward" if test_case.test_config.run_backward else "Forward", - "latency": round(reported_run_time_us[0], 3), + "latency": round(results["reported_run_time_us"][0], 3), "latency unit": "us", + "peak memory": results["peak_memory"], + "memory unit": "KB", } # parsing test_case.test_config.input_config, adding it as entries to the 'out' dictionary @@ -330,10 +347,26 @@ def _launch_forward(self, test_case, iters, print_per_iter): func = test_case.run_forward if self.use_jit: func = test_case.run_jit_forward - forward_time = timeit.timeit( - functools.partial(func, iters, print_per_iter, cuda_sync), number=1 + if self.use_compile: + func = test_case.run_compile_forward + + if not cuda_sync: + forward_time = timeit.timeit( + functools.partial(func, iters, print_per_iter, cuda_sync), number=1 + ) + return forward_time + # Stable timing with Timer + timer = Timer( + stmt="func(iters, print_per_iter, cuda_sync)", + globals={ + "func": func, + "iters": iters, + "print_per_iter": print_per_iter, + "cuda_sync": cuda_sync, + }, ) - return forward_time + result = timer.adaptive_autorange(min_run_time=0.0001) + return result.median * iters def _launch_backward(self, test_case, iters, print_per_iter=False): """This function runs forward path of an op to get an output. Then the backward path is executed @@ -346,7 +379,7 @@ def _launch_backward(self, test_case, iters, print_per_iter=False): ) return backward_time - def _measure_time(self, launch_test, test_case, iters, print_per_iter): + def _measure_metrics(self, launch_test, test_case, iters, print_per_iter): """ This function execute the operator for iterations then look at the time. If it's not significant, the number of iterations will be increased before rerun. @@ -354,8 +387,25 @@ def _measure_time(self, launch_test, test_case, iters, print_per_iter): """ curr_test_total_time = 0 time_trace = [] + peak_memory = 0 + input_values = test_case.op_bench.inputs.values() + device, device_module = None, None + if input_values and isinstance(next(iter(input_values)), torch.Tensor): + # The device and device module information are crucial for memory metric calculation, + # In case of ops where inputs are integers (not tensor), memory metrics need not be calculated. 
+ sample_input = next(iter(input_values)) + device = sample_input.device + device_module = torch.get_device_module(device.type) + # TODO: add support for cpu memory measurement while True: + if hasattr(device_module, "reset_peak_memory_stats"): + device_module.reset_peak_memory_stats(device) run_time_sec = launch_test(test_case, iters, print_per_iter) + if hasattr(device_module, "synchronize"): + device_module.synchronize(device) + # Memory measurement process + if hasattr(device_module, "max_memory_allocated"): + peak_memory = device_module.max_memory_allocated(device) curr_test_total_time += run_time_sec # Analyze time after each run to decide if the result is stable results_are_significant = self._iteration_result_is_significant( @@ -369,7 +419,13 @@ def _measure_time(self, launch_test, test_case, iters, print_per_iter): time_trace.append(report_run_time) # Print out the time spent in each epoch in ms if self.args.report_aibench: - mode = "JIT" if self.use_jit else "Eager" + mode = ( + "JIT" + if self.use_jit + else "Compile" + if self.use_compile + else "Eager" + ) test_name = "_".join( [test_case.framework, test_case.test_config.test_name, mode] ) @@ -381,7 +437,7 @@ def _measure_time(self, launch_test, test_case, iters, print_per_iter): "metric": "latency", "unit": "ms", "value": str(report_run_time / 1e3), - } + }, ) ) if results_are_significant: @@ -391,7 +447,7 @@ def _measure_time(self, launch_test, test_case, iters, print_per_iter): # iteration count, and run the benchmark again... iters = self._predict_num_iter_needed(iters) reported_run_time_us = np.percentile(np.array(time_trace), 50) - return reported_run_time_us + return reported_run_time_us, peak_memory / 1024 def _check_keep(self, test_flag, cmd_flag): return cmd_flag is None or test_flag == cmd_flag @@ -478,6 +534,7 @@ def _output_json( self, perf_list, output_file, + benchmark_name="PyTorch operator benchmark", ): """ Write the result into JSON format, so that it can be uploaded to the benchmark database @@ -495,8 +552,10 @@ def _output_json( input_config = perf_item.get("input_config", "") run_type = perf_item.get("run") latency = perf_item.get("latency", 0) - - dtype = "float32" # default + peak_memory = perf_item.get("peak memory", 0) + device = perf_item.get("device", "unknown") + dtype = perf_item.get("dtype", "torch.float").split(".")[1] + runtime = perf_item.get("runtime", None) # Extract mode based on run_type mode = None @@ -505,6 +564,22 @@ def _output_json( elif run_type == "Backward": mode = "training" + # Extract use_compile from it + if runtime == "Compile": + use_compile = True + elif runtime == "Eager": + use_compile = False + else: + use_compile = None + + device_arch = ( + torch.cuda.get_device_name(0) + if device == "cuda" + else platform.processor() + if device == "cpu" + else "unknown" + ) + # Create the record @dataclass class BenchmarkInfo: @@ -532,12 +607,18 @@ class BenchmarkRecord: model: ModelInfo metric: MetricInfo - record = BenchmarkRecord( + # Add record for latency + record_latency = BenchmarkRecord( benchmark=BenchmarkInfo( - name="PyTorch operator benchmark", + name=benchmark_name, mode=mode, dtype=dtype, - extra_info={"input_config": input_config}, + extra_info={ + "input_config": input_config, + "device": device, + "arch": device_arch, + "use_compile": use_compile, + }, ), model=ModelInfo( name=test_name, type="micro-benchmark", origins=["pytorch"] @@ -549,8 +630,17 @@ class BenchmarkRecord: target_value=None, ), ) - - records.append(asdict(record)) + 
records.append(asdict(record_latency)) + + # Add record for peak memory + record_memory = copy.deepcopy(record_latency) + record_memory.metric = MetricInfo( + name="peak memory", + unit="KB", + benchmark_values=[peak_memory], + target_value=None, + ) + records.append(asdict(record_memory)) # Write all records to the output file with open(output_file, "w", encoding="utf-8") as f: @@ -566,6 +656,7 @@ def run(self): "tag", "run_backward", "Execution Time", + "Peak Memory (KB)", ] if self.args.output_json or self.args.output_json_for_dashboard: @@ -603,13 +694,16 @@ def run(self): test_case, self.args.warmup_iterations, print_per_iter=False ) # Actual Execution - reported_time = [ - self._measure_time( + results = [ + self._measure_metrics( launch_func, test_case, self.iters, self.print_per_iter ) for _ in range(self.num_runs) ] - self._print_perf_result(reported_time, test_case) + result_dict = dict() + result_dict["reported_run_time_us"] = [r[0] for r in results] + result_dict["peak_memory"] = results[0][1] + self._print_perf_result(results=result_dict, test_case=test_case) # output results to csv self._output_csv( @@ -625,16 +719,17 @@ def run(self): ), test_case.test_config.tag, test_case.test_config.run_backward, - reported_time[0], + result_dict["reported_run_time_us"][0], + result_dict["peak_memory"], ], ) if self.args.output_json or self.args.output_json_for_dashboard: - perf_list.append( - self._perf_result_to_dict(reported_time, test_case) - ) + perf_list.append(self._perf_result_to_dict(result_dict, test_case)) if self.args.output_json_for_dashboard: - self._output_json(perf_list, self.args.output_json_for_dashboard) + self._output_json( + perf_list, self.args.output_json_for_dashboard, self.args.benchmark_name + ) if self.args.output_json: with open(self.args.output_json, "w") as f: diff --git a/benchmarks/operator_benchmark/benchmark_pytorch.py b/benchmarks/operator_benchmark/benchmark_pytorch.py index 52ae47047daa..cfed9ebac04b 100644 --- a/benchmarks/operator_benchmark/benchmark_pytorch.py +++ b/benchmarks/operator_benchmark/benchmark_pytorch.py @@ -4,6 +4,15 @@ import torch +# Import the C++ extension to register the _consume operator +try: + import benchmark_cpp_extension # noqa: F401 +except ImportError as err: + # If the extension isn't built, the script must raise an error + raise ImportError( + "Failed to import C++ extension, please build it using \ncd pt_extension \npython -m pip install ." + ) from err + """PyTorch performance microbenchmarks. 
This module contains PyTorch-specific functionalities for performance @@ -71,6 +80,16 @@ def forward_consume(self, iters: int): for _ in range(iters): torch.ops.operator_benchmark._consume(self.forward_impl()) + def forward_impl_eager(self): + # This is to supply the inputs to the forward function which + # will be called in both the eager and compile mode of local runs + return self.forward(*self.get_inputs()) + + def forward_consume_eager(self, iters: int): + # Eager version of forward_consume without decorators (compilation handled by torch.compile) + for _ in range(iters): + torch.ops.operator_benchmark._consume(self.forward_impl_eager()) + def module_name(self): """this is used to label the operator being benchmarked""" if self.user_given_name: @@ -117,18 +136,34 @@ def __init__(self, op_bench, test_config): self.framework = "PyTorch" self.time_series = [] self._jit_forward_graph = None + self._compile_forward_graph = None def _generate_jit_forward_graph(self): """generate a graph for the forward function via scripting""" scripted_op_bench = torch.jit.script(self.op_bench) return scripted_op_bench.forward_consume + def _generate_compile_forward_graph(self): + """generate a compiled graph for the forward function via torch.compile""" + compiled_forward_consume = torch.compile( + self.op_bench.forward_consume_eager, backend="inductor" + ) + return compiled_forward_consume + def run_jit_forward(self, num_runs, print_per_iter=False, cuda_sync=False): """Run the forward path of an op with JIT mode""" if self._jit_forward_graph is None: self._jit_forward_graph = self._generate_jit_forward_graph() self._jit_forward_graph(num_runs) + def run_compile_forward(self, num_runs, print_per_iter=False, cuda_sync=False): + """Run the forward path of an op with compile mode""" + if self._compile_forward_graph is None: + self._compile_forward_graph = self._generate_compile_forward_graph() + self._compile_forward_graph(num_runs) + if cuda_sync: + torch.cuda.synchronize(torch.cuda.current_device()) + def _print_per_iter(self): # print last 50 values length = min(len(self.time_series), 50) @@ -150,14 +185,14 @@ def run_forward(self, num_runs, print_per_iter, cuda_sync): if print_per_iter: for _ in range(num_runs): start_time = time.time() - self.output = self.op_bench.forward_impl() + self.output = self.op_bench.forward_impl_eager() if cuda_sync: torch.cuda.synchronize(torch.cuda.current_device()) end_time = time.time() self.time_series.append((end_time - start_time) * 1e3) else: for _ in range(num_runs): - self.output = self.op_bench.forward_impl() + self.output = self.op_bench.forward_impl_eager() if cuda_sync: torch.cuda.synchronize(torch.cuda.current_device()) diff --git a/benchmarks/operator_benchmark/benchmark_runner.py b/benchmarks/operator_benchmark/benchmark_runner.py index 9dfab781498e..6568cf9bf3ee 100644 --- a/benchmarks/operator_benchmark/benchmark_runner.py +++ b/benchmarks/operator_benchmark/benchmark_runner.py @@ -62,6 +62,13 @@ def parse_args(): default=None, ) + parser.add_argument( + "--benchmark-name", + "--benchmark_name", + help="Name of the benchmark to store results to", + default="PyTorch operator benchmark", + ) + parser.add_argument( "--list-tests", "--list_tests", @@ -135,6 +142,16 @@ def parse_args(): help="Run operators with PyTorch JIT mode", ) + parser.add_argument( + "--use-compile", + "--use_compile", + type=benchmark_utils.str2bool, + nargs="?", + const=True, + default=False, + help="Run operators with PyTorch Compile mode", + ) + parser.add_argument( "--forward-only", 
"--forward_only", @@ -162,7 +179,7 @@ def parse_args(): "--output-json-for-dashboard", "--output_json_for_dashboard", help="Save results in JSON format for display on the OSS dashboard", - default="False", + default="benchmark-results.json", ) args, _ = parser.parse_known_args() diff --git a/benchmarks/operator_benchmark/expected_ci_operator_benchmark_eager_float32_cpu.csv b/benchmarks/operator_benchmark/expected_ci_operator_benchmark_eager_float32_cpu.csv index 873f14d20127..9a7b6797e982 100644 --- a/benchmarks/operator_benchmark/expected_ci_operator_benchmark_eager_float32_cpu.csv +++ b/benchmarks/operator_benchmark/expected_ci_operator_benchmark_eager_float32_cpu.csv @@ -1,5 +1,5 @@ Benchmarking Framework,Benchmarking Module Name,Case Name,tag,run_backward,Execution Time -PyTorch,add,add_M1_N1_K1_cpu,short,FALSE,3.9497 +PyTorch,add,add_M1_N1_K1_cpu,short,FALSE,2.459 PyTorch,add,add_M64_N64_K64_cpu,short,FALSE,14.3181 PyTorch,add,add_M64_N64_K128_cpu,short,FALSE,14.6826 PyTorch,add,add_M1_N1_K1_cpu_bwdall_BACKWARD,short,TRUE,58.1449 @@ -376,10 +376,10 @@ PyTorch,relu6,"relu6_dims(3,4,5)_contigFalse_inplaceFalse_dtypetorch.qint32",sho PyTorch,relu6,"relu6_dims(2,3,4,5)_contigFalse_inplaceFalse_dtypetorch.quint8",short,FALSE,9.6588 PyTorch,relu6,"relu6_dims(2,3,4,5)_contigFalse_inplaceFalse_dtypetorch.qint8",short,FALSE,9.5969 PyTorch,relu6,"relu6_dims(2,3,4,5)_contigFalse_inplaceFalse_dtypetorch.qint32",short,FALSE,9.547 -PyTorch,relu6,"relu6_dims(512,512)_contigFalse_inplaceFalse_dtypetorch.quint8",short,FALSE,68.739 +PyTorch,relu6,"relu6_dims(512,512)_contigFalse_inplaceFalse_dtypetorch.quint8",short,FALSE,50.21375 PyTorch,relu6,"relu6_dims(512,512)_contigFalse_inplaceFalse_dtypetorch.qint8",short,FALSE,45.14133333 PyTorch,relu6,"relu6_dims(512,512)_contigFalse_inplaceFalse_dtypetorch.qint32",short,FALSE,52.6664 -PyTorch,relu6,"relu6_dims(256,1024)_contigFalse_inplaceFalse_dtypetorch.quint8",short,FALSE,69.1875 +PyTorch,relu6,"relu6_dims(256,1024)_contigFalse_inplaceFalse_dtypetorch.quint8",short,FALSE,51.49525 PyTorch,relu6,"relu6_dims(256,1024)_contigFalse_inplaceFalse_dtypetorch.qint8",short,FALSE,48.3458 PyTorch,relu6,"relu6_dims(256,1024)_contigFalse_inplaceFalse_dtypetorch.qint32",short,FALSE,62.0719 PyTorch,functional.hardtanh,"functional.hardtanh_dims(3,4,5)_contigFalse_inplaceFalse_dtypetorch.quint8",short,FALSE,7.5728 @@ -388,10 +388,10 @@ PyTorch,functional.hardtanh,"functional.hardtanh_dims(3,4,5)_contigFalse_inplace PyTorch,functional.hardtanh,"functional.hardtanh_dims(2,3,4,5)_contigFalse_inplaceFalse_dtypetorch.quint8",short,FALSE,8.1647 PyTorch,functional.hardtanh,"functional.hardtanh_dims(2,3,4,5)_contigFalse_inplaceFalse_dtypetorch.qint8",short,FALSE,8.1768 PyTorch,functional.hardtanh,"functional.hardtanh_dims(2,3,4,5)_contigFalse_inplaceFalse_dtypetorch.qint32",short,FALSE,8.0619 -PyTorch,functional.hardtanh,"functional.hardtanh_dims(512,512)_contigFalse_inplaceFalse_dtypetorch.quint8",short,FALSE,67.118 +PyTorch,functional.hardtanh,"functional.hardtanh_dims(512,512)_contigFalse_inplaceFalse_dtypetorch.quint8",short,FALSE,48.88475 PyTorch,functional.hardtanh,"functional.hardtanh_dims(512,512)_contigFalse_inplaceFalse_dtypetorch.qint8",short,FALSE,43.702 PyTorch,functional.hardtanh,"functional.hardtanh_dims(512,512)_contigFalse_inplaceFalse_dtypetorch.qint32",short,FALSE,50.3613 -PyTorch,functional.hardtanh,"functional.hardtanh_dims(256,1024)_contigFalse_inplaceFalse_dtypetorch.quint8",short,FALSE,67.436 
+PyTorch,functional.hardtanh,"functional.hardtanh_dims(256,1024)_contigFalse_inplaceFalse_dtypetorch.quint8",short,FALSE,50.3995 PyTorch,functional.hardtanh,"functional.hardtanh_dims(256,1024)_contigFalse_inplaceFalse_dtypetorch.qint8",short,FALSE,46.9813 PyTorch,functional.hardtanh,"functional.hardtanh_dims(256,1024)_contigFalse_inplaceFalse_dtypetorch.qint32",short,FALSE,59.2295 PyTorch,functional.hardsigmoid,"functional.hardsigmoid_dims(3,4,5)_contigFalse_inplaceFalse_dtypetorch.quint8",short,FALSE,6.5189 @@ -1316,4 +1316,4 @@ PyTorch,where,"where_cond_shape(8,16,1)_input_shape(1,)_other_shape(1,)_cpu_dtyp PyTorch,where,"where_cond_shape(8,16,1)_input_shape(16,1)_other_shape(8,16,1)_cpu_dtypetorch.float32",short,FALSE,5.763 PyTorch,where,"where_cond_shape(8,16,1)_input_shape(8,1,1)_other_shape(1,)_cpu_dtypetorch.float32",short,FALSE,5.744666667 PyTorch,clamp,clamp_M512_N512_cpu,short,FALSE,15.26233333 -PyTorch,gelu,gelu_M512_N512_cpu,short,FALSE,31.33166667 \ No newline at end of file +PyTorch,gelu,gelu_M512_N512_cpu,short,FALSE,31.33166667 diff --git a/benchmarks/operator_benchmark/pt/add_test.py b/benchmarks/operator_benchmark/pt/add_test.py index 54504c4f3005..739b8ef14a54 100644 --- a/benchmarks/operator_benchmark/pt/add_test.py +++ b/benchmarks/operator_benchmark/pt/add_test.py @@ -52,27 +52,6 @@ def forward(self, input_one, input_two): op_bench.generate_pt_test(add_long_configs + add_short_configs, AddBenchmark) op_bench.generate_pt_gradient_test(add_long_configs + add_short_configs, AddBenchmark) - -"""Mircobenchmark for addmm operator.""" - - -class AddmmBenchmark(op_bench.TorchBenchmarkBase): - def init(self, M, N, K, device): - self.inputs = { - "input_one": torch.rand(M, K, device=device, requires_grad=self.auto_set()), - "mat1": torch.rand(M, N, device=device, requires_grad=self.auto_set()), - "mat2": torch.rand(N, K, device=device, requires_grad=self.auto_set()), - } - self.set_module_name("addmm") - - def forward(self, input_one, mat1, mat2): - return torch.addmm(input_one, mat1, mat2) - - -op_bench.generate_pt_test(add_long_configs + add_short_configs, AddmmBenchmark) -op_bench.generate_pt_gradient_test(add_long_configs + add_short_configs, AddmmBenchmark) - - """Mircobenchmark for addr operator.""" @@ -106,46 +85,5 @@ def forward(self, input_one, vec1, vec2): op_bench.generate_pt_test(addr_configs, AddrBenchmark) op_bench.generate_pt_gradient_test(addr_configs, AddrBenchmark) - -"""Mircobenchmark for addbmm operator.""" - - -class AddbmmBenchmark(op_bench.TorchBenchmarkBase): - def init(self, B, M, N, K, device): - self.inputs = { - "input_one": torch.rand( - (M, N), device=device, requires_grad=self.auto_set() - ), - "batch1": torch.rand( - (B, M, K), device=device, requires_grad=self.auto_set() - ), - "batch2": torch.rand( - ( - B, - K, - N, - ), - device=device, - requires_grad=self.auto_set(), - ), - } - self.set_module_name("addbmm") - - def forward(self, input_one, batch1, batch2): - return torch.addbmm(input_one, batch1, batch2) - - -addbmm_configs = op_bench.cross_product_configs( - B=[2, 100], - M=[8, 256], - N=[256, 16], - K=[15, 16], - device=["cpu", "cuda"], - tags=["addbmm"], -) - -op_bench.generate_pt_test(addbmm_configs, AddbmmBenchmark) -op_bench.generate_pt_gradient_test(addbmm_configs, AddbmmBenchmark) - if __name__ == "__main__": op_bench.benchmark_runner.main() diff --git a/benchmarks/operator_benchmark/pt/addmm_test.py b/benchmarks/operator_benchmark/pt/addmm_test.py new file mode 100644 index 000000000000..a98628944b3e --- /dev/null +++ 
b/benchmarks/operator_benchmark/pt/addmm_test.py @@ -0,0 +1,115 @@ +import operator_benchmark as op_bench + +import torch + + +"""Microbenchmarks for add_(matmul) operator. Supports both Caffe2/PyTorch.""" + +# Configs for PT add operator +addmm_long_configs = op_bench.cross_product_configs( + M=[256, 1024, 3000], + N=[512, 4096], + K=[512, 4096], + device=["cuda"], + tags=["long"], + dtype=[torch.float16, torch.bfloat16, torch.float32], +) + + +addmm_short_configs = op_bench.config_list( + attr_names=["M", "N", "K"], + attrs=[ + [1, 1, 1], + [64, 64, 64], + [64, 64, 128], + ], + cross_product_configs={ + "device": ["cpu", "cuda"], + "dtype": [torch.float], + }, + tags=["short"], +) + + +"""Mircobenchmark for addmm operator.""" + + +class AddmmBenchmark(op_bench.TorchBenchmarkBase): + def init(self, M, N, K, device, dtype): + self.inputs = { + "input_one": torch.rand( + M, K, device=device, requires_grad=self.auto_set(), dtype=dtype + ), + "mat1": torch.rand( + M, N, device=device, requires_grad=self.auto_set(), dtype=dtype + ), + "mat2": torch.rand( + N, K, device=device, requires_grad=self.auto_set(), dtype=dtype + ), + } + self.set_module_name("addmm") + + def forward(self, input_one, mat1, mat2): + return torch.addmm(input_one, mat1, mat2) + + +op_bench.generate_pt_test(addmm_long_configs + addmm_long_configs, AddmmBenchmark) +op_bench.generate_pt_gradient_test( + addmm_long_configs + addmm_long_configs, AddmmBenchmark +) + +"""Mircobenchmark for addbmm operator.""" + + +class AddbmmBenchmark(op_bench.TorchBenchmarkBase): + def init(self, B, M, N, K, device, dtype): + self.inputs = { + "input_one": torch.rand( + (M, N), device=device, requires_grad=self.auto_set(), dtype=dtype + ), + "batch1": torch.rand( + (B, M, K), device=device, requires_grad=self.auto_set(), dtype=dtype + ), + "batch2": torch.rand( + ( + B, + K, + N, + ), + device=device, + requires_grad=self.auto_set(), + dtype=dtype, + ), + } + self.set_module_name("addbmm") + + def forward(self, input_one, batch1, batch2): + return torch.addbmm(input_one, batch1, batch2) + + +addbmm_long_configs = op_bench.cross_product_configs( + B=[8, 32], + M=[256, 1024], + N=[256, 1024], + K=[64, 128], + device=["cuda"], + dtype=[torch.float16, torch.bfloat16, torch.float32], + tags=["long"], +) +addbmm_short_configs = op_bench.cross_product_configs( + B=[1, 8], + M=[8, 128], + N=[32, 64], + K=[256, 512], + device=["cpu", "cuda"], + dtype=[torch.float16, torch.bfloat16, torch.float32], + tags=["short"], +) + +op_bench.generate_pt_test(addbmm_long_configs + addbmm_short_configs, AddbmmBenchmark) +op_bench.generate_pt_gradient_test( + addbmm_long_configs + addbmm_short_configs, AddbmmBenchmark +) + +if __name__ == "__main__": + op_bench.benchmark_runner.main() diff --git a/benchmarks/operator_benchmark/pt/bmm_test.py b/benchmarks/operator_benchmark/pt/bmm_test.py index 1c6d1f9aca55..f867f6ac09f8 100644 --- a/benchmarks/operator_benchmark/pt/bmm_test.py +++ b/benchmarks/operator_benchmark/pt/bmm_test.py @@ -27,12 +27,12 @@ ) batched_binary_configs_long = op_bench.cross_product_configs( - B=[1, 128], - M=[8, 128], - N=[32, 64], - K=[4, 256], - device=["cpu", "cuda"], - dtype=[torch.float, torch.bfloat16], + B=[8, 32], + M=[256, 1024], + N=[256, 1024], + K=[64, 128], + device=["cuda"], + dtype=[torch.float32, torch.bfloat16, torch.float16], tags=["long"], ) @@ -40,8 +40,12 @@ class BatchedBinaryOpBenchmark(op_bench.TorchBenchmarkBase): def init(self, B, M, N, K, device, dtype, op_func): self.inputs = { - "batch1": torch.rand((B, M, N), 
device=device).to(dtype=dtype), - "batch2": torch.rand((B, N, K), device=device).to(dtype=dtype), + "batch1": torch.rand( + (B, M, N), device=device, dtype=dtype, requires_grad=self.auto_set() + ), + "batch2": torch.rand( + (B, N, K), device=device, dtype=dtype, requires_grad=self.auto_set() + ), } self.op_func = op_func @@ -54,6 +58,11 @@ def forward(self, batch1, batch2): batched_binary_configs_short + batched_binary_configs_long, BatchedBinaryOpBenchmark, ) +op_bench.generate_pt_gradient_tests_from_op_list( + batched_binary_ops, + batched_binary_configs_long, + BatchedBinaryOpBenchmark, +) # batched ternary ops @@ -66,9 +75,15 @@ def forward(self, batch1, batch2): class BatchedTernaryOpBenchmark(op_bench.TorchBenchmarkBase): def init(self, B, M, N, K, device, dtype, op_func): self.inputs = { - "input_": torch.rand((B, M, K), device=device).to(dtype=dtype), - "batch1": torch.rand((B, M, N), device=device).to(dtype=dtype), - "batch2": torch.rand((B, N, K), device=device).to(dtype=dtype), + "input_": torch.rand( + (B, M, K), device=device, dtype=dtype, requires_grad=self.auto_set() + ), + "batch1": torch.rand( + (B, M, N), device=device, dtype=dtype, requires_grad=self.auto_set() + ), + "batch2": torch.rand( + (B, N, K), device=device, dtype=dtype, requires_grad=self.auto_set() + ), } self.op_func = op_func @@ -81,6 +96,12 @@ def forward(self, input_, batch1, batch2): batched_binary_configs_short + batched_binary_configs_long, BatchedTernaryOpBenchmark, ) +op_bench.generate_pt_gradient_tests_from_op_list( + batched_ternary_ops, + batched_binary_configs_long, + BatchedTernaryOpBenchmark, +) + # TODO: does it automatically register new scripts? diff --git a/benchmarks/operator_benchmark/pt/matmul_test.py b/benchmarks/operator_benchmark/pt/matmul_test.py index e92728e9ebd3..d0c58aa16e8f 100644 --- a/benchmarks/operator_benchmark/pt/matmul_test.py +++ b/benchmarks/operator_benchmark/pt/matmul_test.py @@ -13,33 +13,46 @@ [128, 128, 128, True, False], [256, 256, 256, False, True], ], - cross_product_configs={ - "device": ["cpu", "cuda"], - }, + cross_product_configs={"device": ["cpu", "cuda"]}, tags=["short"], ) mm_long_configs = op_bench.cross_product_configs( - M=[32], - N=[512, 128], - K=[64], + M=[256, 1024, 3000], + N=[512, 4096], + K=[512, 4096], trans_a=[False, True], trans_b=[True, False], - device=["cpu", "cuda"], + device=["cuda"], + dtype=[torch.float16, torch.bfloat16, torch.float32], tags=["long"], ) class MatMulBenchmark(op_bench.TorchBenchmarkBase): - def init(self, M, N, K, trans_a, trans_b, device): + def init(self, M, N, K, trans_a, trans_b, device, dtype=torch.float): + # Create tensors without requires_grad first, then set it separately + # This avoids creating graph leaves that cannot be deep copied + if trans_a: + input_one = torch.rand(M, N, device=device, dtype=dtype) + else: + input_one = torch.rand(N, M, device=device, dtype=dtype).t() + + if trans_b: + input_two = torch.rand(N, K, device=device, dtype=dtype) + else: + input_two = torch.rand(K, N, device=device, dtype=dtype).t() + + # Set requires_grad after tensor creation to avoid graph leaf issues + if self.auto_set(): + input_one.requires_grad_(True) + if self.auto_set(): + input_two.requires_grad_(True) + self.inputs = { - "input_one": torch.rand(M, N, device=device) - if trans_a - else torch.rand(N, M, device=device).t(), - "input_two": torch.rand(N, K, device=device) - if trans_b - else torch.rand(K, N, device=device).t(), + "input_one": input_one, + "input_two": input_two, } self.set_module_name("matmul") @@ 
-48,6 +61,7 @@ def forward(self, input_one, input_two): op_bench.generate_pt_test(mm_long_configs + mm_short_configs, MatMulBenchmark) +op_bench.generate_pt_gradient_test(mm_long_configs, MatMulBenchmark) if __name__ == "__main__": diff --git a/benchmarks/operator_benchmark/pt/mm_test.py b/benchmarks/operator_benchmark/pt/mm_test.py index bf2a2651e8fb..f9e0743ba712 100644 --- a/benchmarks/operator_benchmark/pt/mm_test.py +++ b/benchmarks/operator_benchmark/pt/mm_test.py @@ -23,11 +23,11 @@ ) mm_long_configs = op_bench.cross_product_configs( - M=[8, 128], - N=[32, 64], - K=[256, 512], - device=["cpu", "cuda"], - dtype=[torch.float, torch.bfloat16], + M=[256, 1024, 3000], + N=[512, 4096], + K=[512, 4096], + device=["cuda"], + dtype=[torch.float16, torch.bfloat16, torch.float32], tags=["long"], ) @@ -35,8 +35,12 @@ class MmOpBenchmark(op_bench.TorchBenchmarkBase): def init(self, M, N, K, device, dtype, op_func): self.inputs = { - "input_one": torch.randn(M, N, device=device).to(dtype=dtype), - "input_two": torch.randn(N, K, device=device).to(dtype=dtype), + "input_one": torch.randn( + M, N, device=device, requires_grad=self.auto_set(), dtype=dtype + ), + "input_two": torch.randn( + N, K, device=device, requires_grad=self.auto_set(), dtype=dtype + ), } self.op_func = op_func @@ -47,6 +51,9 @@ def forward(self, input_one, input_two): op_bench.generate_pt_tests_from_op_list( ops_list, mm_short_configs + mm_long_configs, MmOpBenchmark ) +op_bench.generate_pt_gradient_tests_from_op_list( + ops_list, mm_long_configs, MmOpBenchmark +) if __name__ == "__main__": diff --git a/buckbuild.bzl b/buckbuild.bzl index c5608f53ffea..193c16fbd4e5 100644 --- a/buckbuild.bzl +++ b/buckbuild.bzl @@ -11,7 +11,7 @@ load("//tools/build_defs:glob_defs.bzl", "subdir_glob") load("//tools/build_defs:platform_defs.bzl", "APPLETVOS", "IOS", "MACOSX") load("//tools/build_defs:type_defs.bzl", "is_list", "is_string") load("//tools/build_defs/android:build_mode_defs.bzl", is_production_build_android = "is_production_build") -load("//tools/build_defs/apple:build_mode_defs.bzl", is_production_build_ios = "is_production_build") +load("//tools/build_defs/apple:build_mode_defs.bzl", is_production_build_ios = "is_production_build", is_profile_build_ios = "is_profile_build") load( ":build_variables.bzl", "aten_cpu_source_list", @@ -74,7 +74,7 @@ def _is_build_mode_dev(): if is_production_build_android(): # Android Prod builds return False - if is_production_build_ios(): + if is_production_build_ios() or is_profile_build_ios(): # iOS Prod builds return False @@ -391,6 +391,8 @@ def get_aten_generated_files(enabled_backends): "CompositeExplicitAutogradFunctions_inl.h", "CompositeExplicitAutogradNonFunctionalFunctions.h", "CompositeExplicitAutogradNonFunctionalFunctions_inl.h", + "ViewMetaClasses.h", + "ViewMetaClasses.cpp", "VmapGeneratedPlumbing.h", "core/ATenOpList.cpp", "core/TensorBody.h", @@ -1192,6 +1194,7 @@ def define_buck_targets( "NativeMetaFunctions.h": ":gen_aten[NativeMetaFunctions.h]", "Operators.h": ":gen_aten[Operators.h]", "RedispatchFunctions.h": ":gen_aten[RedispatchFunctions.h]", + "ViewMetaClasses.h": ":gen_aten[ViewMetaClasses.h]", "core/TensorBody.h": ":gen_aten[core/TensorBody.h]", "core/aten_interned_strings.h": ":gen_aten[core/aten_interned_strings.h]", "core/enum_tag.h": ":gen_aten[core/enum_tag.h]", diff --git a/build.bzl b/build.bzl index 7c2c3e24dc5a..91529e75c9f0 100644 --- a/build.bzl +++ b/build.bzl @@ -118,6 +118,9 @@ def define_targets(rules): ":LazyNonNativeIr.h", ":RegisterDispatchDefinitions.ini", 
":RegisterDispatchKey.cpp", + ":ViewMetaClassesPythonBinding.cpp", + ":ViewMetaClasses.cpp", + ":ViewMetaClasses.h", ":native_functions.yaml", ":shape_inference.h", ":tags.yaml", @@ -170,6 +173,7 @@ GENERATED_H = [ "FunctionalInverses.h", "RedispatchFunctions.h", "RegistrationDeclarations.h", + "ViewMetaClasses.h", "VmapGeneratedPlumbing.h", ] @@ -246,6 +250,7 @@ GENERATED_CPP = [ "RegisterFunctionalization_1.cpp", "RegisterFunctionalization_2.cpp", "RegisterFunctionalization_3.cpp", + "ViewMetaClasses.cpp", ] GENERATED_CPP_CORE = [ @@ -307,6 +312,7 @@ _GENERATED_AUTOGRAD_PYTHON_CPP = [ "torch/csrc/autograd/generated/python_torch_functions_1.cpp", "torch/csrc/autograd/generated/python_torch_functions_2.cpp", "torch/csrc/autograd/generated/python_variable_methods.cpp", + "torch/csrc/functionalization/generated/ViewMetaClassesPythonBinding.cpp" ] GENERATED_AUTOGRAD_PYTHON = _GENERATED_AUTOGRAD_PYTHON_HEADERS + _GENERATED_AUTOGRAD_PYTHON_CPP diff --git a/build_variables.bzl b/build_variables.bzl index dfae1d527bb7..05f5fb1068c8 100644 --- a/build_variables.bzl +++ b/build_variables.bzl @@ -512,6 +512,7 @@ libtorch_distributed_base_sources = [ "torch/csrc/distributed/c10d/TCPStore.cpp", "torch/csrc/distributed/c10d/TCPStoreBackend.cpp", "torch/csrc/distributed/c10d/TCPStoreLibUvBackend.cpp", + "torch/csrc/distributed/c10d/Types.cpp", "torch/csrc/distributed/c10d/Utils.cpp", "torch/csrc/distributed/c10d/Work.cpp", "torch/csrc/distributed/c10d/comm.cpp", @@ -635,6 +636,12 @@ libtorch_nativert_sources = [ "torch/nativert/graph/passes/pass_manager/GraphPasses.cpp", "torch/nativert/graph/passes/pass_manager/PassManager.cpp", "torch/nativert/kernels/KernelHandlerRegistry.cpp", + "torch/nativert/kernels/TritonKernel.cpp", + "torch/nativert/executor/triton/CpuTritonKernelManager.cpp", +] + +libtorch_nativert_cuda_sources = [ + "torch/nativert/executor/triton/CudaTritonKernelManager.cpp", ] torch_mobile_tracer_sources = [ @@ -755,14 +762,22 @@ libtorch_cuda_distributed_extra_sources = [ "torch/csrc/distributed/c10d/symm_mem/NCCLSymmetricMemory.cu", "torch/csrc/distributed/c10d/symm_mem/intra_node_comm.cpp", "torch/csrc/distributed/c10d/symm_mem/intra_node_comm.cu", + "torch/csrc/distributed/c10d/symm_mem/cuda_mem_pool.cpp", "torch/csrc/distributed/rpc/tensorpipe_cuda.cpp", ] +libtorch_nvshmem_sources = [ + "torch/csrc/distributed/c10d/cuda/utils.cpp", + "torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryUtils.cpp", + "torch/csrc/distributed/c10d/symm_mem/nvshmem_extension.cu", + "torch/csrc/distributed/c10d/symm_mem/NVSHMEMSymmetricMemory.cu", +] + libtorch_cuda_distributed_sources = libtorch_cuda_distributed_base_sources + libtorch_cuda_distributed_extra_sources libtorch_cuda_sources = libtorch_cuda_core_sources + libtorch_cuda_distributed_sources + [ "torch/csrc/cuda/nccl.cpp", -] +] + libtorch_nativert_cuda_sources torch_cpp_srcs = [ "torch/csrc/api/src/cuda.cpp", # this just forwards stuff, no real CUDA @@ -992,6 +1007,7 @@ libtorch_python_core_sources = [ "torch/csrc/utils/disable_torch_function.cpp", "torch/csrc/utils/verbose.cpp", "torch/csrc/cpu/Module.cpp", + "torch/csrc/functionalization/Module.cpp", "torch/csrc/instruction_counter/Module.cpp", "torch/nativert/python/Bindings.cpp", ] + lazy_tensor_core_python_sources @@ -1034,6 +1050,7 @@ def glob_libtorch_python_sources(gencode_pattern = ":generate-code[{}]"): "torch/csrc/autograd/generated/python_torch_functions_1.cpp", "torch/csrc/autograd/generated/python_torch_functions_2.cpp", 
"torch/csrc/autograd/generated/python_variable_methods.cpp", + "torch/csrc/functionalization/generated/ViewMetaClassesPythonBinding.cpp", ]] _libtorch_python_sources.extend(libtorch_python_core_sources) @@ -1079,6 +1096,7 @@ aten_cpu_source_non_codegen_list = [ "aten/src/ATen/DeviceAccelerator.cpp", "aten/src/ATen/Context.cpp", "aten/src/ATen/DLConvertor.cpp", + "aten/src/ATen/DTensorState.cpp", "aten/src/ATen/EmptyTensor.cpp", "aten/src/ATen/ExpandUtils.cpp", "aten/src/ATen/CachedTensorUtils.cpp", diff --git a/c10/core/AllocatorConfig.cpp b/c10/core/AllocatorConfig.cpp index e154338d501b..c6b6e95f43b2 100644 --- a/c10/core/AllocatorConfig.cpp +++ b/c10/core/AllocatorConfig.cpp @@ -45,7 +45,7 @@ size_t AcceleratorAllocatorConfig::roundup_power2_divisions(size_t size) { 63 - llvm::countLeadingZeros(kRoundUpPowerOfTwoStart); const size_t interval_end = 63 - llvm::countLeadingZeros(kRoundUpPowerOfTwoEnd); - TORCH_CHECK_VALUE( + TORCH_CHECK( interval_end - interval_start == kRoundUpPowerOfTwoIntervals, "kRoundUpPowerOfTwoIntervals mismatch"); @@ -64,7 +64,7 @@ size_t AcceleratorAllocatorConfig::parseMaxSplitSize( std::numeric_limits::max() / kMB; size_t val_env = tokenizer.toSizeT(++i); - TORCH_CHECK_VALUE( + TORCH_CHECK( val_env >= min_allowed_split_size_mb, "CachingAllocator option max_split_size_mb too small, must be >= ", min_allowed_split_size_mb); @@ -83,7 +83,7 @@ size_t AcceleratorAllocatorConfig::parseMaxNonSplitRoundingSize( std::numeric_limits::max() / kMB; size_t val_env = tokenizer.toSizeT(++i); - TORCH_CHECK_VALUE( + TORCH_CHECK( val_env >= min_allowed_split_size_mb, "CachingAllocator option max_non_split_rounding_mb too small, must be >= ", min_allowed_split_size_mb); @@ -98,7 +98,7 @@ size_t AcceleratorAllocatorConfig::parseGarbageCollectionThreshold( size_t i) { tokenizer.checkToken(++i, ":"); double val_env = tokenizer.toDouble(++i); - TORCH_CHECK_VALUE( + TORCH_CHECK( val_env > 0 && val_env < 1.0, "garbage_collect_threshold is invalid, set it in (0.0, 1.0)"); garbage_collection_threshold_ = val_env; @@ -119,7 +119,7 @@ size_t AcceleratorAllocatorConfig::parseRoundUpPower2Divisions( size_t value_index = i; tokenizer.checkToken(++i, ":"); size_t value = tokenizer.toSizeT(++i); - TORCH_CHECK_VALUE( + TORCH_CHECK( value == 0 || llvm::isPowerOf2_64(value), "For roundups, the divisions has to be power of 2 or 0 to disable roundup "); @@ -133,7 +133,7 @@ size_t AcceleratorAllocatorConfig::parseRoundUpPower2Divisions( value); } else { size_t boundary = tokenizer.toSizeT(value_index); - TORCH_CHECK_VALUE( + TORCH_CHECK( llvm::isPowerOf2_64(boundary), "For roundups, the intervals have to be power of 2 "); @@ -163,7 +163,7 @@ size_t AcceleratorAllocatorConfig::parseRoundUpPower2Divisions( "Expected closing bracket ']' in ConfigTokenizer but reached end of config"); } else { // Keep this for backwards compatibility size_t value = tokenizer.toSizeT(i); - TORCH_CHECK_VALUE( + TORCH_CHECK( llvm::isPowerOf2_64(value), "For roundups, the divisions has to be power of 2 "); std::fill( diff --git a/c10/core/AllocatorConfig.h b/c10/core/AllocatorConfig.h index efde5e3a8ff9..68cc47a8417c 100644 --- a/c10/core/AllocatorConfig.h +++ b/c10/core/AllocatorConfig.h @@ -76,7 +76,7 @@ class ConfigTokenizer { } else if (token == "False") { return false; } else { - TORCH_CHECK_VALUE( + TORCH_CHECK( false, "Expected 'True' or 'False' at index ", i, diff --git a/c10/core/Contiguity.h b/c10/core/Contiguity.h index 279a795583b1..eed3f2498342 100644 --- a/c10/core/Contiguity.h +++ b/c10/core/Contiguity.h @@ 
-33,7 +33,8 @@ bool _compute_contiguous(ArrayRef sizes, ArrayRef strides, T numel) { } // Return a SymBool with underlying symbolic expression that represents -// contiguity. Guaranteed not to add guards. +// contiguity. Guaranteed not to throw DDE, may return a symbolic expression +// or symbolic True. inline static c10::SymBool _compute_contiguous_sym( ArrayRef sizes, ArrayRef strides, @@ -76,6 +77,8 @@ inline static c10::SymBool _compute_contiguous_sym( return true; }; + // We try to minimize creating large symbolic expressions when not needed to + // avoid symbolic evaluation perf issues. if (is_contiguous_or_false()) { return c10::SymBool(true); } @@ -94,6 +97,9 @@ inline static c10::SymBool _compute_contiguous_sym( return is_contiguous_cond.sym_or(is_empty); } +// When T is SymInt this function may throw a data dependent error. +// _compute_channels_last_contiguous_2d_sym does not. Only use this function +// when inputs are hinted. template bool _compute_channels_last_contiguous_2d( ArrayRef sizes, @@ -105,8 +111,8 @@ bool _compute_channels_last_contiguous_2d( T expected = 1; for (auto& d : {1, 3, 2, 0}) { const auto& size_d = sizes[d]; - if (TORCH_GUARD_SIZE_OBLIVIOUS(sym_ne(size_d, 1))) { - if (TORCH_GUARD_SIZE_OBLIVIOUS(sym_ne(strides[d], expected))) { + if (size_d != 1) { + if (strides[d] != expected) { return false; } expected *= size_d; @@ -123,6 +129,65 @@ bool _compute_channels_last_contiguous_2d( } } +// Return a SymBool with underlying symbolic expression that represents +// contiguity. Guaranteed not to throw DDE, may return a symbolic expression +// or symbolic True. +inline static c10::SymBool _compute_channels_last_contiguous_2d_sym( + ArrayRef sizes, + ArrayRef strides) { + switch (sizes.size()) { + case 4: { + // When this function returns True, the result is always true. When it returns + // False, the result could be False or data dependent. + auto guard_or_false = [&]() { + c10::SymInt expected = 1; + for (auto& d : {1, 3, 2, 0}) { + const auto& size_d = sizes[d]; + // Not taking this branch could make this return False instead of True + // but not vice-versa, so it's ok. + if (TORCH_GUARD_OR_FALSE(sym_eq(sizes[d], 1))) { + continue; + } + // Taking this branch could make this return False instead of True + // but not vice-versa, so it's ok. + if (TORCH_GUARD_OR_TRUE(sym_ne(strides[d], expected))) { + return false; + } + expected *= size_d; + } + return true; + }; + + // We try to minimize creating large symbolic expressions when not needed + // to avoid symbolic evaluation perf issues. + if (guard_or_false()) { + return c10::SymBool(true); + } + + // Result is either false, or data dependent. + c10::SymInt expected_stride = 1; + c10::SymBool cond = true; + + for (auto& d : {1, 3, 2, 0}) { + const auto& size_d = sizes[d]; + cond = cond.sym_and( + size_d.sym_eq(1).sym_or(sym_eq(strides[d], expected_stride))); + expected_stride *= size_d; + } + return cond; + } + // NOLINTNEXTLINE(bugprone-branch-clone) + case 3: + // TODO dim == 3 case will be enabled once it is fully tested + return c10::SymBool(false); + default: + return c10::SymBool(false); + } +} + +// When T is SymInt this function may throw a data dependent error. +// _compute_channels_last_contiguous_3d_sym does not. Only use this function +// when inputs are hinted.
template bool _compute_channels_last_contiguous_3d( ArrayRef sizes, @@ -134,8 +199,8 @@ bool _compute_channels_last_contiguous_3d( T expected = 1; for (auto& d : {1, 4, 3, 2, 0}) { const auto& size_d = sizes[d]; - if (TORCH_GUARD_SIZE_OBLIVIOUS(sym_ne(size_d, 1))) { - if (TORCH_GUARD_SIZE_OBLIVIOUS(sym_ne(strides[d], expected))) { + if (size_d != 1) { + if (strides[d] != expected) { return false; } expected *= size_d; @@ -152,6 +217,59 @@ bool _compute_channels_last_contiguous_3d( } } +inline static c10::SymBool _compute_channels_last_contiguous_3d_sym( + ArrayRef sizes, + ArrayRef strides) { + switch (sizes.size()) { + case 5: { + // When this function return True, result always true. When it return + // False, result could be False or data dependent. + auto guard_or_false = [&]() { + c10::SymInt expected = 1; + for (auto& d : {1, 4, 3, 2, 0}) { + const auto& size_d = sizes[d]; + // Not taking this branch could make this return False instead of True + // but not vice-versa. so its ok. + if (TORCH_GUARD_OR_FALSE(sym_eq(sizes[d], 1))) { + continue; + } + // Taking this branch could make this return False instead of True + // but not vice-versa. so its ok. + if (TORCH_GUARD_OR_TRUE(sym_ne(strides[d], expected))) { + return false; + } + expected *= size_d; + } + return true; + }; + + // We try to minimize creating large symbolic expressions when not needed + // to avoid symbolic evaluation perf issues. + if (guard_or_false()) { + return c10::SymBool(true); + } + + // Result is either false, or data dependent. + c10::SymInt expected_stride = 1; + c10::SymBool cond = true; + + for (auto& d : {1, 4, 3, 2, 0}) { + const auto& size_d = sizes[d]; + cond = cond.sym_and( + size_d.sym_eq(1).sym_or(sym_eq(strides[d], expected_stride))); + expected_stride *= size_d; + } + return cond; + } + // NOLINTNEXTLINE(bugprone-branch-clone) + case 4: + // TODO dim == 4 case will be enabled once it is fully tested + return c10::SymBool(false); + default: + return c10::SymBool(false); + } +} + template bool _compute_non_overlapping_and_dense( ArrayRef sizes, diff --git a/c10/core/SymInt.cpp b/c10/core/SymInt.cpp index c6c2743d8358..b78ca94dc514 100644 --- a/c10/core/SymInt.cpp +++ b/c10/core/SymInt.cpp @@ -20,6 +20,14 @@ void SymInt::promote_to_negative() { s.data_ = 0; } +std::optional SymInt::maybe_as_int_slow_path() const { + auto* node = toSymNodeImplUnowned(); + if (auto c = node->constant_int()) { + return c; + } + return node->maybe_as_int(); +} + SymNode SymInt::toSymNode() const { TORCH_CHECK_ALWAYS_SHOW_CPP_STACKTRACE( is_heap_allocated(), "SymInt::toSymNode is_heap_allocated"); @@ -45,12 +53,11 @@ bool SymInt::has_hint() const { #define DEFINE_BINARY(API, OP, METHOD, RET) \ RET SymInt::API(const SymInt& sci) const { \ if (auto ma = maybe_as_int()) { \ - if (auto mb = sci.maybe_as_int()) { \ - return RET(OP(*ma, *mb)); \ - } else { \ - auto b = sci.toSymNode(); \ - return RET(b->wrap_int(*ma)->METHOD(b)); \ - } \ + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( \ + !sci.maybe_as_int(), \ + "should have hit fast path in the header in this case."); \ + auto b = sci.toSymNode(); \ + return RET(b->wrap_int(*ma)->METHOD(b)); \ } else { \ if (auto mb = sci.maybe_as_int()) { \ auto a = toSymNodeImplUnowned(); \ @@ -61,19 +68,19 @@ bool SymInt::has_hint() const { } \ } -DEFINE_BINARY(operator+, std::plus<>(), add, SymInt) -DEFINE_BINARY(operator-, std::minus<>(), sub, SymInt) -DEFINE_BINARY(operator*, std::multiplies<>(), mul, SymInt) -DEFINE_BINARY(operator/, std::divides<>(), floordiv, SymInt) -DEFINE_BINARY(operator%, 
std::modulus<>(), mod, SymInt) -DEFINE_BINARY(sym_eq, std::equal_to<>(), eq, SymBool) -DEFINE_BINARY(sym_ne, std::not_equal_to<>(), ne, SymBool) -DEFINE_BINARY(sym_lt, std::less<>(), lt, SymBool) -DEFINE_BINARY(sym_le, std::less_equal<>(), le, SymBool) -DEFINE_BINARY(sym_gt, std::greater<>(), gt, SymBool) -DEFINE_BINARY(sym_ge, std::greater_equal<>(), ge, SymBool) -DEFINE_BINARY(min, std::min, sym_min, SymInt) -DEFINE_BINARY(max, std::max, sym_max, SymInt) +DEFINE_BINARY(operator_add_slow_path, std::plus<>(), add, SymInt) +DEFINE_BINARY(operator_sub_slow_path, std::minus<>(), sub, SymInt) +DEFINE_BINARY(operator_mul_slow_path, std::multiplies<>(), mul, SymInt) +DEFINE_BINARY(operator_div_slow_path, std::divides<>(), floordiv, SymInt) +DEFINE_BINARY(operator_mod_slow_path, std::modulus<>(), mod, SymInt) +DEFINE_BINARY(sym_eq_slow_path, std::equal_to<>(), eq, SymBool) +DEFINE_BINARY(sym_ne_slow_path, std::not_equal_to<>(), ne, SymBool) +DEFINE_BINARY(sym_lt_slow_path, std::less<>(), lt, SymBool) +DEFINE_BINARY(sym_le_slow_path, std::less_equal<>(), le, SymBool) +DEFINE_BINARY(sym_gt_slow_path, std::greater<>(), gt, SymBool) +DEFINE_BINARY(sym_ge_slow_path, std::greater_equal<>(), ge, SymBool) +DEFINE_BINARY(min_slow_path, std::min, sym_min, SymInt) +DEFINE_BINARY(max_slow_path, std::max, sym_max, SymInt) SymInt::operator SymFloat() const { if (auto ma = maybe_as_int()) { @@ -153,15 +160,15 @@ SymInt operator-(const SymInt& s) { } } -void SymInt::operator*=(const SymInt& sci) { +void SymInt::operator_imul_slow_path(const SymInt& sci) { *this = *this * sci; } -void SymInt::operator/=(const SymInt& sci) { +void SymInt::operator_idiv_slow_path(const SymInt& sci) { *this = *this / sci; } -void SymInt::operator+=(const SymInt& sci) { +void SymInt::operator_iadd_slow_path(const SymInt& sci) { *this = *this + sci; } diff --git a/c10/core/SymInt.h b/c10/core/SymInt.h index 51686f8b81af..9b1c776cbe2a 100644 --- a/c10/core/SymInt.h +++ b/c10/core/SymInt.h @@ -7,6 +7,7 @@ #include #include +#include #include #include #include @@ -177,23 +178,136 @@ class C10_API SymInt { #endif } - SymInt operator+(const SymInt& sci) const; - SymInt operator-(const SymInt& sci) const; - SymInt operator*(const SymInt& sci) const; - SymInt operator/(const SymInt& sci) const; - SymInt operator%(const SymInt& sci) const; - void operator*=(const SymInt& sci); - void operator+=(const SymInt& sci); - void operator/=(const SymInt& sci); + SymInt operator+(const SymInt& sci) const { + if (auto ma = maybe_as_int()) { + if (auto mb = sci.maybe_as_int()) { + return SymInt(*ma + *mb); + } + } + return operator_add_slow_path(sci); + } + + SymInt operator-(const SymInt& sci) const { + if (auto ma = maybe_as_int()) { + if (auto mb = sci.maybe_as_int()) { + return SymInt(*ma - *mb); + } + } + return operator_sub_slow_path(sci); + } + + SymInt operator*(const SymInt& sci) const { + if (auto ma = maybe_as_int()) { + if (auto mb = sci.maybe_as_int()) { + return SymInt(*ma * *mb); + } + } + return operator_mul_slow_path(sci); + } + + SymInt operator/(const SymInt& sci) const { + if (auto ma = maybe_as_int()) { + if (auto mb = sci.maybe_as_int()) { + return SymInt(*ma / *mb); + } + } + return operator_div_slow_path(sci); + } + + SymInt operator%(const SymInt& sci) const { + if (auto ma = maybe_as_int()) { + if (auto mb = sci.maybe_as_int()) { + return SymInt(*ma % *mb); + } + } + return operator_mod_slow_path(sci); + } + + void operator*=(const SymInt& sci) { + if (auto ma = maybe_as_int()) { + if (auto mb = sci.maybe_as_int()) { + *this = 
SymInt(*ma * *mb); + return; + } + } + operator_imul_slow_path(sci); + } + + void operator+=(const SymInt& sci) { + if (auto ma = maybe_as_int()) { + if (auto mb = sci.maybe_as_int()) { + *this = SymInt(*ma + *mb); + return; + } + } + operator_iadd_slow_path(sci); + } + + void operator/=(const SymInt& sci) { + if (auto ma = maybe_as_int()) { + if (auto mb = sci.maybe_as_int()) { + *this = SymInt(*ma / *mb); + return; + } + } + operator_idiv_slow_path(sci); + } SymInt clone() const; - SymBool sym_eq(const SymInt&) const; - SymBool sym_ne(const SymInt&) const; - SymBool sym_lt(const SymInt&) const; - SymBool sym_le(const SymInt&) const; - SymBool sym_gt(const SymInt&) const; - SymBool sym_ge(const SymInt&) const; + SymBool sym_eq(const SymInt& sci) const { + if (auto ma = maybe_as_int()) { + if (auto mb = sci.maybe_as_int()) { + return SymBool(*ma == *mb); + } + } + return sym_eq_slow_path(sci); + } + + SymBool sym_ne(const SymInt& sci) const { + if (auto ma = maybe_as_int()) { + if (auto mb = sci.maybe_as_int()) { + return SymBool(*ma != *mb); + } + } + return sym_ne_slow_path(sci); + } + + SymBool sym_lt(const SymInt& sci) const { + if (auto ma = maybe_as_int()) { + if (auto mb = sci.maybe_as_int()) { + return SymBool(*ma < *mb); + } + } + return sym_lt_slow_path(sci); + } + + SymBool sym_le(const SymInt& sci) const { + if (auto ma = maybe_as_int()) { + if (auto mb = sci.maybe_as_int()) { + return SymBool(*ma <= *mb); + } + } + return sym_le_slow_path(sci); + } + + SymBool sym_gt(const SymInt& sci) const { + if (auto ma = maybe_as_int()) { + if (auto mb = sci.maybe_as_int()) { + return SymBool(*ma > *mb); + } + } + return sym_gt_slow_path(sci); + } + + SymBool sym_ge(const SymInt& sci) const { + if (auto ma = maybe_as_int()) { + if (auto mb = sci.maybe_as_int()) { + return SymBool(*ma >= *mb); + } + } + return sym_ge_slow_path(sci); + } bool operator==(const SymInt& o) const { return sym_eq(o).guard_bool(__FILE__, __LINE__); @@ -214,8 +328,23 @@ class C10_API SymInt { return sym_ge(o).guard_bool(__FILE__, __LINE__); } - SymInt min(const SymInt& sci) const; - SymInt max(const SymInt& sci) const; + SymInt min(const SymInt& sci) const { + if (auto ma = maybe_as_int()) { + if (auto mb = sci.maybe_as_int()) { + return SymInt(std::min(*ma, *mb)); + } + } + return min_slow_path(sci); + } + + SymInt max(const SymInt& sci) const { + if (auto ma = maybe_as_int()) { + if (auto mb = sci.maybe_as_int()) { + return SymInt(std::max(*ma, *mb)); + } + } + return max_slow_path(sci); + } // If both are symbolic, this checks if // they share the same node. 
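A small usage note on the restructuring above (hedged, not part of the patch): with the operators now defined inline, arithmetic between two hinted `SymInt`s never leaves the header, and only operands backed by a real `SymNode` reach the new `*_slow_path` functions in `SymInt.cpp`.

```cpp
#include <c10/core/SymInt.h>

void symint_fast_path_sketch() {
  c10::SymInt a(6), b(3);          // plain ints: maybe_as_int() holds a value
  c10::SymInt sum = a + b;         // folded inline in SymInt.h, no SymNode involved
  bool same = (a / b) == c10::SymInt(2);  // comparisons also stay on the int path

  // Had either operand wrapped a SymNode (a real symbolic size), the inline
  // maybe_as_int() check would fail and the call would forward to
  // operator_add_slow_path() / sym_eq_slow_path() defined out of line.
  (void)sum;
  (void)same;
}
```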
@@ -239,11 +368,7 @@ class C10_API SymInt { if (!is_heap_allocated()) { return data_; } - auto* node = toSymNodeImplUnowned(); - if (auto c = node->constant_int()) { - return c; - } - return node->maybe_as_int(); + return maybe_as_int_slow_path(); } // Return whether the integer is directly coercible to a SymInt @@ -264,6 +389,25 @@ class C10_API SymInt { private: void promote_to_negative(); + SymInt operator_add_slow_path(const SymInt& sci) const; + SymInt operator_sub_slow_path(const SymInt& sci) const; + SymInt operator_mul_slow_path(const SymInt& sci) const; + SymInt operator_div_slow_path(const SymInt& sci) const; + SymInt operator_mod_slow_path(const SymInt& sci) const; + void operator_imul_slow_path(const SymInt& sci); + void operator_iadd_slow_path(const SymInt& sci); + void operator_idiv_slow_path(const SymInt& sci); + SymBool sym_eq_slow_path(const SymInt& sci) const; + SymBool sym_ne_slow_path(const SymInt& sci) const; + SymBool sym_lt_slow_path(const SymInt& sci) const; + SymBool sym_le_slow_path(const SymInt& sci) const; + SymBool sym_gt_slow_path(const SymInt& sci) const; + SymBool sym_ge_slow_path(const SymInt& sci) const; + + SymInt min_slow_path(const SymInt& sci) const; + SymInt max_slow_path(const SymInt& sci) const; + + std::optional maybe_as_int_slow_path() const; // Constraints on the internal representation: // diff --git a/c10/core/SymbolicShapeMeta.cpp b/c10/core/SymbolicShapeMeta.cpp index 6fa2ab0ed4f1..01276d416fbb 100644 --- a/c10/core/SymbolicShapeMeta.cpp +++ b/c10/core/SymbolicShapeMeta.cpp @@ -71,6 +71,27 @@ normalize_sym_sizes_strides(SymIntArrayRef sizes, SymIntArrayRef strides) { return std::tuple, std::vector>( std::move(base), std::move(size_nodes), std::move(stride_nodes)); } +namespace { +bool all_hinted( + const c10::SymIntArrayRef& sizes, + const c10::SymIntArrayRef& strides) { + auto all_hinted = true; + for (const auto& s : sizes) { + if (!s.has_hint()) { + return false; + } + } + + if (all_hinted) { + for (const auto& s : strides) { + if (!s.has_hint()) { + return false; + } + } + } + return all_hinted; +} +} // namespace // Special treatment because of numel SymBool SymbolicShapeMeta::compute_contiguous() const { @@ -88,28 +109,61 @@ SymBool SymbolicShapeMeta::compute_contiguous() const { return maybe_as_bool.value(); } - auto all_hinted = true; - for (const auto& s : sizes) { - if (!s.has_hint()) { - all_hinted = false; - break; - } + if (all_hinted(sizes, strides)) { + // We avoid going through the slow path if everything is hinted, + // because evaluating a large SymPy expression can be expensive. + // TODO exclude backed_size_oblivious from this path. + return _compute_contiguous(sizes_, strides_, numel()); } - if (all_hinted) { - for (const auto& s : strides) { - if (!s.has_hint()) { - all_hinted = false; - break; - } - } + return result; +} + +SymBool SymbolicShapeMeta::compute_channels_last_contiguous_2d() const { + if (!strides_valid_) { + return false; } + c10::SymIntArrayRef sizes(sizes_); + c10::SymIntArrayRef strides(strides_); - if (all_hinted) { + auto result = _compute_channels_last_contiguous_2d_sym(sizes, strides); + + // If the result is already determined without guarding, just return it. + auto maybe_as_bool = result.maybe_as_bool(); + if (maybe_as_bool.has_value()) { + return maybe_as_bool.value(); + } + + if (all_hinted(sizes, strides)) { // We avoid going through the slow path if everything is hinted, // because evaluating a large SymPy expression can be expensive. // TODO exclude backed_size_oblivious from this path. 
- return _compute_contiguous(sizes_, strides_, numel()); + return _compute_channels_last_contiguous_2d(sizes_, strides_); + } + + return result; +} + +SymBool SymbolicShapeMeta::compute_channels_last_contiguous_3d() const { + if (!strides_valid_) { + return false; + } + c10::SymIntArrayRef sizes(sizes_); + c10::SymIntArrayRef strides(strides_); + + auto result = _compute_channels_last_contiguous_3d_sym(sizes, strides); + + // If the result is already determined without guarding, just return it. + auto maybe_as_bool = result.maybe_as_bool(); + if (maybe_as_bool.has_value()) { + return maybe_as_bool.value(); + } + + if (all_hinted(sizes, strides)) { + // We avoid going through the slow path if everything is hinted, + // because evaluating a large SymPy expression can be expensive. + // TODO exclude backed_size_oblivious from this path. + return _compute_channels_last_contiguous_3d(sizes_, strides_); } return result; @@ -143,8 +197,6 @@ SymBool SymbolicShapeMeta::compute_contiguous() const { } // clang-format off -DEFINE_EAGER_SYMBOOL_COMPUTE(compute_channels_last_contiguous_2d, _compute_channels_last_contiguous_2d) -DEFINE_EAGER_SYMBOOL_COMPUTE(compute_channels_last_contiguous_3d, _compute_channels_last_contiguous_3d) DEFINE_EAGER_SYMBOOL_COMPUTE(compute_strides_like_channels_last_2d, is_channels_last_strides_2d) DEFINE_EAGER_SYMBOOL_COMPUTE(compute_strides_like_channels_last_3d, is_channels_last_strides_3d) diff --git a/c10/core/TensorImpl.cpp b/c10/core/TensorImpl.cpp index f3ec2f2d46ea..cd0321d3bb6f 100644 --- a/c10/core/TensorImpl.cpp +++ b/c10/core/TensorImpl.cpp @@ -313,8 +313,15 @@ void TensorImpl::throw_data_ptr_access_error() const { c10::SymBool TensorImpl::sym_is_contiguous_custom( at::MemoryFormat memory_format) const { if (C10_UNLIKELY(matches_python_custom(SizesStridesPolicy::CustomStrides))) { - return pyobj_slot_.load_pyobj_interpreter()->is_contiguous( - this, memory_format); + // TO reduce BC breaking and reduce having to introduce + // sym_is_contiguous. 
call is_contiguous when tensor does not + if (C10_UNLIKELY(has_symbolic_sizes_strides_)) { + return pyobj_slot_.load_pyobj_interpreter()->sym_is_contiguous( + this, memory_format); + } else { + return pyobj_slot_.load_pyobj_interpreter()->is_contiguous( + this, memory_format); + } } return sym_is_contiguous_default(memory_format); diff --git a/c10/core/impl/PyInterpreter.cpp b/c10/core/impl/PyInterpreter.cpp index b4ae1d612e96..913bc7872657 100644 --- a/c10/core/impl/PyInterpreter.cpp +++ b/c10/core/impl/PyInterpreter.cpp @@ -60,6 +60,10 @@ struct NoopPyInterpreterVTable final : public PyInterpreterVTable { bool is_contiguous(const TensorImpl* self, at::MemoryFormat) const override { PANIC(is_contiguous); } + c10::SymBool sym_is_contiguous(const TensorImpl* self, at::MemoryFormat) + const override { + PANIC(sym_is_contiguous); + } bool is_strides_like(const TensorImpl* self, at::MemoryFormat) const override { PANIC(is_strides_like); diff --git a/c10/core/impl/PyInterpreter.h b/c10/core/impl/PyInterpreter.h index 09d4801f7d83..def708c24b80 100644 --- a/c10/core/impl/PyInterpreter.h +++ b/c10/core/impl/PyInterpreter.h @@ -168,6 +168,9 @@ struct C10_API PyInterpreterVTable { virtual bool is_contiguous(const TensorImpl* self, at::MemoryFormat) const = 0; + virtual c10::SymBool sym_is_contiguous( + const TensorImpl* self, + at::MemoryFormat) const = 0; virtual bool is_strides_like(const TensorImpl* self, at::MemoryFormat) const = 0; virtual bool is_non_overlapping_and_dense(const TensorImpl* self) const = 0; diff --git a/c10/cuda/CUDAAllocatorConfig.cpp b/c10/cuda/CUDAAllocatorConfig.cpp index d2efb8c593e4..8706f7362a3d 100644 --- a/c10/cuda/CUDAAllocatorConfig.cpp +++ b/c10/cuda/CUDAAllocatorConfig.cpp @@ -25,6 +25,7 @@ CUDAAllocatorConfig::CUDAAllocatorConfig() #endif m_release_lock_on_cudamalloc(false), m_pinned_use_cuda_host_register(false), + m_graph_capture_record_stream_reuse(false), m_pinned_use_background_threads(false) { m_roundup_power2_divisions.assign(kRoundUpPowerOfTwoIntervals, 0); } @@ -373,6 +374,9 @@ void CUDAAllocatorConfig::parseArgs(const std::optional& env) { } else if (config_item_view == "pinned_use_background_threads") { i = parsePinnedUseBackgroundThreads(config, i); used_native_specific_option = true; + } else if (config_item_view == "graph_capture_record_stream_reuse") { + i = parseGraphCaptureRecordStreamReuse(config, i); + used_native_specific_option = true; } else { TORCH_CHECK( false, "Unrecognized CachingAllocator option: ", config_item_view); @@ -406,6 +410,23 @@ size_t CUDAAllocatorConfig::parsePinnedUseCudaHostRegister( return i; } +size_t CUDAAllocatorConfig::parseGraphCaptureRecordStreamReuse( + const std::vector& config, + size_t i) { + consumeToken(config, ++i, ':'); + if (++i < config.size()) { + TORCH_CHECK( + (config[i] == "True" || config[i] == "False"), + "Expected a single True/False argument for graph_capture_record_stream_reuse"); + m_graph_capture_record_stream_reuse = (config[i] == "True"); + } else { + TORCH_CHECK( + false, "Error, expecting graph_capture_record_stream_reuse value", ""); + } + + return i; +} + size_t CUDAAllocatorConfig::parsePinnedNumRegisterThreads( const std::vector& config, size_t i) { diff --git a/c10/cuda/CUDAAllocatorConfig.h b/c10/cuda/CUDAAllocatorConfig.h index fda3cc02e5d0..54c41ba70fb6 100644 --- a/c10/cuda/CUDAAllocatorConfig.h +++ b/c10/cuda/CUDAAllocatorConfig.h @@ -53,6 +53,10 @@ class C10_CUDA_API CUDAAllocatorConfig { return instance().m_release_lock_on_cudamalloc; } + static bool 
graph_capture_record_stream_reuse() { + return instance().m_graph_capture_record_stream_reuse; + } + /** Pinned memory allocator settings */ static bool pinned_use_cuda_host_register() { return instance().m_pinned_use_cuda_host_register; @@ -142,6 +146,9 @@ class C10_CUDA_API CUDAAllocatorConfig { size_t parsePinnedUseBackgroundThreads( const std::vector& config, size_t i); + size_t parseGraphCaptureRecordStreamReuse( + const std::vector& config, + size_t i); std::atomic m_max_split_size; std::atomic m_max_non_split_rounding_size; @@ -153,6 +160,7 @@ class C10_CUDA_API CUDAAllocatorConfig { m_expandable_segments_handle_type; std::atomic m_release_lock_on_cudamalloc; std::atomic m_pinned_use_cuda_host_register; + std::atomic m_graph_capture_record_stream_reuse; std::atomic m_pinned_use_background_threads; std::string m_last_allocator_settings; std::mutex m_last_allocator_settings_mutex; diff --git a/c10/cuda/CUDACachingAllocator.cpp b/c10/cuda/CUDACachingAllocator.cpp index e701f1527c00..93ac4f7a4c64 100644 --- a/c10/cuda/CUDACachingAllocator.cpp +++ b/c10/cuda/CUDACachingAllocator.cpp @@ -1167,8 +1167,13 @@ class DeviceCachingAllocator { // tracks which pools we can use as a last resort before ooming ska::flat_hash_set use_on_oom_pools; - // See free() for this thing's purpose - std::vector needs_events_deferred_until_no_capture; + // Map of blocks whose freeing is deferred until after CUDA graph capture. + // - Key: Block* to be freed. + // - Value: List of "empty nodes" inserted as free markers during capture. + // If the vector is empty, the block must always be deferred until capture + // ends. + ska::flat_hash_map> deferred_blocks; + // outstanding cuda events ska::flat_hash_map< cuda::CUDAStream, @@ -1329,6 +1334,11 @@ class DeviceCachingAllocator { // capture. Cross-stream memory use is uncommon, so the deferral's // effect on memory use during capture should be small. process_events(context); + } else { + if (CUDAAllocatorConfig::graph_capture_record_stream_reuse()) { + // We check if there is some block that is safe to reuse on this stream + free_safe_blocks_in_capture(context, stream); + } } size_t size = round_size(orig_size); auto& pool = get_pool(size, stream); @@ -1619,6 +1629,248 @@ class DeviceCachingAllocator { return block; } + // Insert "free marker" (empty nodes) into the CUDA graph for all streams that + // have used the block, including the allocation stream. These nodes mark the + // last use of the block in the capture graph. Returns a vector of the + // inserted nodes, or an empty vector if any stream is not capturing. 
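For reference, the new knob is read from the caching-allocator settings string (normally supplied through `PYTORCH_CUDA_ALLOC_CONF`, e.g. `graph_capture_record_stream_reuse:True`), and the `deferred_blocks` map above is what the capture-time reuse check consults. A condensed sketch of that per-block decision, which `free_safe_blocks_in_capture` below implements (the helper name here is illustrative):

```cpp
#include <vector>
#include <cuda_runtime.h>
#include <c10/util/flat_hash_map.h>

// Hedged sketch: an empty marker list always means "wait until capture ends";
// otherwise the block may be reused only once every one of its free markers is
// ordered before every terminal node of the capture graph.
bool block_reusable_during_capture(
    const std::vector<cudaGraphNode_t>& free_markers,
    const ska::flat_hash_set<cudaGraphNode_t>& markers_behind_all_terminals) {
  if (free_markers.empty()) {
    return false;
  }
  for (const auto& node : free_markers) {
    if (markers_behind_all_terminals.count(node) == 0) {
      return false;
    }
  }
  return true;
}
```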
+ std::vector insert_free_marker(Block* block) { + std::vector empty_nodes; + + auto try_add_empty_node = [&](cudaStream_t stream) -> bool { + cudaStreamCaptureStatus status{}; + cudaGraph_t graph{}; + const cudaGraphNode_t* deps = nullptr; + size_t num_deps = 0; +#if (defined(CUDA_VERSION) && CUDA_VERSION >= 13000) + C10_CUDA_CHECK(cudaStreamGetCaptureInfo( + stream, &status, nullptr, &graph, &deps, nullptr, &num_deps)); +#else + C10_CUDA_CHECK(cudaStreamGetCaptureInfo_v2( + stream, &status, nullptr, &graph, &deps, &num_deps)); +#endif + + TORCH_INTERNAL_ASSERT( + status != cudaStreamCaptureStatusInvalidated, + "Invalid stream capture status"); + + if (status == cudaStreamCaptureStatusNone) { + return false; + } + + cudaGraphNode_t node{}; + C10_CUDA_CHECK(cudaGraphAddEmptyNode(&node, graph, deps, num_deps)); +#if (defined(CUDA_VERSION) && CUDA_VERSION >= 13000) + C10_CUDA_CHECK(cudaStreamUpdateCaptureDependencies( + stream, &node, nullptr, 1, cudaStreamSetCaptureDependencies)); +#else + C10_CUDA_CHECK(cudaStreamUpdateCaptureDependencies( + stream, &node, 1, cudaStreamSetCaptureDependencies)); +#endif + empty_nodes.push_back(node); + return true; + }; + + // If any stream is not currently capturing, return an empty node vector. + // An empty vector indicates that the block should be deferred for freeing + // until after capture. + + // Attempt to add an empty node for the allocation stream. + if (!try_add_empty_node(block->stream)) { + return {}; + } + // Attempt to add empty nodes for all streams that have used the block. + for (const auto& s : block->stream_uses) { + if (!try_add_empty_node(s.stream())) { + return {}; + } + } + return empty_nodes; + } + + // Returns the current set of "terminal" nodes in the CUDA graph for a given + // stream. These represent the current endpoints of the stream, and may + // include additional nodes if the graph branches. Any new work captured will + // be attached after one or more of these terminals. + std::vector get_terminals(cudaStream_t stream) { + std::vector result; + + cudaStreamCaptureStatus status{}; + cudaGraph_t graph{}; + const cudaGraphNode_t* dependencies = nullptr; + size_t num_dependencies = 0; + +#if (defined(CUDA_VERSION) && CUDA_VERSION >= 13000) + C10_CUDA_CHECK(cudaStreamGetCaptureInfo( + stream, + &status, + nullptr, + &graph, + &dependencies, + nullptr, + &num_dependencies)); +#else + C10_CUDA_CHECK(cudaStreamGetCaptureInfo_v2( + stream, &status, nullptr, &graph, &dependencies, &num_dependencies)); +#endif + + TORCH_INTERNAL_ASSERT( + status == cudaStreamCaptureStatusActive, + "Invalid stream capture status"); + + for (size_t i = 0; i < num_dependencies; i++) { + auto node = dependencies[i]; + if (node != nullptr) { + result.push_back(node); + } + } + + return result; + } + + // Returns the set of "reusable" free markers (empty nodes) in the current + // CUDA graph capture. A free marker is considered reusable if it is a + // predecessor of every terminal node. + // This ensures that all future captured work will occur after the free + // marker, making it safe to reuse. + ska::flat_hash_set get_reusable_empty_nodes( + cudaStream_t stream) { + auto terminals = get_terminals(stream); + if (terminals.empty()) { + // No terminal nodes found; nothing to free. 
+ return {}; + } + + auto get_dependencies = [](cudaGraphNode_t node, + cudaGraphNode_t* pDependencies, + size_t* pNumDependencies) -> void { +#if (defined(CUDA_VERSION) && CUDA_VERSION >= 13000) + C10_CUDA_CHECK(cudaGraphNodeGetDependencies( + node, pDependencies, nullptr, pNumDependencies)); +#else + C10_CUDA_CHECK( + cudaGraphNodeGetDependencies(node, pDependencies, pNumDependencies)); +#endif + }; + + // Helper to retrieve all parent nodes (dependencies) of a given node. + auto get_parents = + [&](cudaGraphNode_t node) -> std::vector { + size_t count = 0; + get_dependencies(node, nullptr, &count); + std::vector out(count); + if (count) { + get_dependencies(node, out.data(), &count); + out.resize(count); + } + return out; + }; + + // Helper to determine if a node is an empty node (used as a free marker). + auto is_empty_node = [](cudaGraphNode_t n) -> bool { + cudaGraphNodeType type{}; + C10_CUDA_CHECK(cudaGraphNodeGetType(n, &type)); + return type == cudaGraphNodeTypeEmpty; + }; + + // For each terminal node, perform a reverse DFS to count, for each empty + // node, how many terminals it can reach (i.e., for how many terminals it is + // a predecessor). An empty node is reusable if it is a predecessor of all + // terminal nodes. + ska::flat_hash_map num_terminals_reachable; + + for (auto terminal : terminals) { + ska::flat_hash_set visited; + ska::flat_hash_set empty_nodes; + + std::function reverse_dfs = + [&](cudaGraphNode_t node) { + if (!visited.insert(node).second) + return; + + if (is_empty_node(node)) { + num_terminals_reachable[node]++; + empty_nodes.insert(node); + } + auto parents = get_parents(node); + for (auto p : parents) { + reverse_dfs(p); + } + }; + + reverse_dfs(terminal); + } + + ska::flat_hash_set reusable_empty_nodes; + for (auto [node, count] : num_terminals_reachable) { + if (count == terminals.size()) { + reusable_empty_nodes.insert(node); + } + } + + return reusable_empty_nodes; + } + + // A block is considered reusable during CUDA graph capture if every free + // marker (empty node) associated with the block is a predecessor of every + // terminal node. + // + // This ensures that any new operation added to the graph will be attached + // after all terminal nodes, which themselves are after all free markers. As a + // result, all future work is guaranteed to occur after the block's last use + // on every stream, so the block's previous lifetime ends before any new + // lifetime begins. This check relies solely on the DAG topology and does not + // require event queries, making it safe to use during capture. + // + // This function iterates over all deferred blocks, determines if their empty + // nodes are reusable according to the above criteria, and frees the block if + // so. + void free_safe_blocks_in_capture( + const std::shared_ptr& context, + cudaStream_t stream) { + auto reusable_empty_nodes = get_reusable_empty_nodes(stream); + + // If there are no reusable empty nodes (e.g., not currently capturing), + // there is nothing to do. + if (reusable_empty_nodes.empty()) { + return; + } + + std::vector blocks_to_erase; + + for (auto& [block, inserted_empty_nodes] : deferred_blocks) { + // Skip this block if it has no empty nodes, as we defer its freeing until + // after graph capture. Also skip if the block was not allocated on the + // current stream; such blocks will be freed when + // free_safe_blocks_in_capture is attempted on that stream. 
+ if (inserted_empty_nodes.empty() || block->stream != stream) { + continue; + } + + bool is_reusable = true; + + for (const auto& node : inserted_empty_nodes) { + if (reusable_empty_nodes.find(node) == reusable_empty_nodes.end()) { + is_reusable = false; + break; + } + } + + if (is_reusable) { + // Clear stream uses since the graph ensures proper synchronization. + // No need to insert events. + block->stream_uses.clear(); + + free_block(block, context); + blocks_to_erase.push_back(block); + } + } + + // Remove blocks that were freed from the deferred_blocks map. + for (auto* block : blocks_to_erase) { + deferred_blocks.erase(block); + } + } + void free(Block* block) { std::shared_ptr context = maybeGatherContext(RecordContext::ALL); @@ -1654,14 +1906,22 @@ class DeviceCachingAllocator { if (block->size >= CUDAAllocatorConfig::max_split_size()) stats.oversize_allocations.decrease(1); + // If the block has been used on more than one stream, handle accordingly. if (!block->stream_uses.empty()) { if (C10_UNLIKELY(!captures_underway.empty())) { - // It's forbidden to cudaEventQuery an event recorded during CUDA graph - // capture. We conservatively defer recording end-of-life events until - // the next call to process_events() (which won't happen until no - // captures are underway) - needs_events_deferred_until_no_capture.push_back(block); + if (CUDAAllocatorConfig::graph_capture_record_stream_reuse()) { + // insert_free_marker returns a vector of free markers, + // or an empty vector if any associated stream is not currently + // capturing. The empty vector means that we will defer the free until + // capture is finished. + deferred_blocks.emplace(block, insert_free_marker(block)); + } else { + // If graph_capture_record_stream_reuse is not enabled, always defer + // the free until capture is finished. + deferred_blocks.emplace(block, std::vector{}); + } } else { + // If not in a capture, insert events for the block. 
insert_events(block); } } else { @@ -2977,8 +3237,8 @@ class DeviceCachingAllocator { --it; } if (!(*cur)->expandable_segment_) { - release_block(*cur, context); totalReleased += (*cur)->size; + release_block(*cur, context); } if (is_first) { break; @@ -3287,8 +3547,8 @@ class DeviceCachingAllocator { void insert_events_deferred_until_no_capture( const std::shared_ptr& context) { - if (C10_UNLIKELY(!needs_events_deferred_until_no_capture.empty())) { - for (auto* block : needs_events_deferred_until_no_capture) { + if (C10_UNLIKELY(!deferred_blocks.empty())) { + for (auto& [block, inserted_empty_nodes] : deferred_blocks) { TORCH_INTERNAL_ASSERT(!block->stream_uses.empty()); // only streams recorded before cudagraph will be used to insert events // since we know all streams recorded during cudagraph must have @@ -3300,7 +3560,7 @@ class DeviceCachingAllocator { free_block(block, context); } } - needs_events_deferred_until_no_capture.clear(); + deferred_blocks.clear(); } } @@ -3731,6 +3991,8 @@ class NativeCachingAllocator : public CUDAAllocator { md.pinned_use_host_register = CUDAAllocatorConfig::pinned_use_cuda_host_register(); md.last_allocator_settings = CUDAAllocatorConfig::last_allocator_settings(); + md.graph_capture_record_stream_reuse = + CUDAAllocatorConfig::graph_capture_record_stream_reuse(); md.roundup_power2_divisions = CUDAAllocatorConfig::roundup_power2_divisions(); diff --git a/c10/cuda/CUDACachingAllocator.h b/c10/cuda/CUDACachingAllocator.h index a89adb91e61d..bfc486d69fcf 100644 --- a/c10/cuda/CUDACachingAllocator.h +++ b/c10/cuda/CUDACachingAllocator.h @@ -163,6 +163,7 @@ struct AllocatorConfigInfo { bool expandable_segments; bool release_lock_on_malloc; bool pinned_use_host_register; + bool graph_capture_record_stream_reuse; std::string last_allocator_settings; std::vector roundup_power2_divisions; }; diff --git a/c10/cuda/CUDAFunctions.cpp b/c10/cuda/CUDAFunctions.cpp index 683ed9b76845..9839e4e72049 100644 --- a/c10/cuda/CUDAFunctions.cpp +++ b/c10/cuda/CUDAFunctions.cpp @@ -78,6 +78,18 @@ int device_count_impl(bool fail_if_no_driver) { "would like to use GPUs, turn off ASAN."); break; #endif // C10_ASAN_ENABLED +#if _WIN32 && CUDA_VERSION >= 13000 + // Workaround for CUDA-13.0 error handling on Windows, see + // https://github.com/pytorch/pytorch/issues/162333#issuecomment-3267929585 + case cudaErrorNotSupported: + if (!fail_if_no_driver) { + TORCH_WARN( + "cudaGetDeviceCount() returned cudaErrorNotSupported, " + "likely using older driver or on CPU machine"); + count = 0; + break; + } +#endif default: TORCH_CHECK( false, diff --git a/c10/cuda/driver_api.cpp b/c10/cuda/driver_api.cpp index f936b02ec9ab..d545bf5477b6 100644 --- a/c10/cuda/driver_api.cpp +++ b/c10/cuda/driver_api.cpp @@ -61,11 +61,14 @@ void* get_symbol(const char* name, int version) { } #endif + // As of CUDA 13, this API is deprecated. +#if defined(CUDA_VERSION) && (CUDA_VERSION < 13000) // This fallback to the old API to try getting the symbol again. if (auto st = cudaGetDriverEntryPoint(name, &out, cudaEnableDefault, &qres); st == cudaSuccess && qres == cudaDriverEntryPointSuccess && out) { return out; } +#endif // If the symbol cannot be resolved, report and return nullptr; // the caller is responsible for checking the pointer. 
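Stepping back from the allocator hunks above, a hedged end-to-end sketch of the scenario the new option targets: a block that picked up cross-stream uses during capture previously had to stay allocated until `capture_end()`, whereas with `graph_capture_record_stream_reuse:True` in `PYTORCH_CUDA_ALLOC_CONF` its free markers let it be recycled mid-capture. The snippet uses standard ATen/c10 APIs and omits the warm-up and synchronization a production capture would add:

```cpp
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAEvent.h>
#include <ATen/cuda/CUDAGraph.h>
#include <c10/cuda/CUDACachingAllocator.h>
#include <c10/cuda/CUDAGuard.h>
#include <c10/cuda/CUDAStream.h>

void capture_reuse_sketch() {
  auto main_stream = c10::cuda::getStreamFromPool();
  auto side_stream = c10::cuda::getStreamFromPool();
  c10::cuda::CUDAStreamGuard guard(main_stream);  // capture needs a non-default stream

  at::cuda::CUDAGraph graph;
  graph.capture_begin();
  {
    auto t = at::empty({1 << 20}, at::kCUDA);     // allocates block B
    at::cuda::CUDAEvent fork, join;
    fork.record(main_stream);
    {
      c10::cuda::CUDAStreamGuard side(side_stream);
      fork.block(side_stream);                    // side_stream joins the capture
      auto y = t * 2;                             // B is consumed on side_stream
      c10::cuda::CUDACachingAllocator::recordStream(
          t.storage().data_ptr(), side_stream);   // B now has stream_uses
      join.record(side_stream);
    }
    join.block(main_stream);
  }  // t destroyed mid-capture: free(B) runs while captures_underway is non-empty
  // Old behavior: B stays parked until capture_end(). New behavior: free()
  // records empty "free marker" nodes after B's last captured use on each
  // stream, and a later allocation in this same capture can reuse B once every
  // marker precedes every terminal node of the capture graph.
  auto z = at::empty({1 << 20}, at::kCUDA);
  graph.capture_end();
}
```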
diff --git a/c10/metal/igamma.h b/c10/metal/igamma.h new file mode 100644 index 000000000000..8dabdbbb621c --- /dev/null +++ b/c10/metal/igamma.h @@ -0,0 +1,744 @@ +#pragma once + +#include +#include +#include + +using namespace c10::metal; +using namespace metal; + +namespace c10 { +namespace metal { + +template +inline float log_gamma(const T); + +inline float expm1f(float a); + +template +float erfc(T x); + +} // namespace metal +} // namespace c10 + +namespace { + +template +inline float lgamma(const T a) { + return log_gamma(a); +} + +inline float expm1(float a) { + return expm1f(a); +} + +// NOTE: The following code was ported directly from the CUDA implementation in +// `aten/src/ATen/native/cuda/IGammaKernel.cu` + +/* + * This implementation of the regularized incomplete gamma functions and + * their helper functions are derived from the implementation of SciPy's + * gammainc, Cephes's igam and igamc, and Boost's Lanczos approximations. + * See NOTICE for the licenses. + */ +// regularized lower & upper incomplete gamma +template +scalar_t ratevl( + scalar_t x, + const scalar_t num[], + int64_t M, + const scalar_t denom[], + int64_t N) { + // evaluating rational function, i.e., the ratio of two polynomials + // the coefficients for numerator are given by `num` while coeffs for + // denumerator are given by `denom` + + using accscalar_t = opmath_t; + int64_t i, dir; + accscalar_t y, num_ans, denom_ans; + accscalar_t absx = ::fabs(x); + thread const accscalar_t* p; + + if (absx > 1) { + /* Evaluate as a polynomial in 1/x. */ + dir = -1; + p = num + M; + y = 1 / x; + } else { + dir = 1; + p = num; + y = x; + } + + /* Evaluate the numerator */ + num_ans = *p; + p += dir; + for (i = 1; i <= M; i++) { + num_ans = num_ans * y + *p; + p += dir; + } + /* Evaluate the denominator */ + if (absx > 1) { + p = denom + N; + } else { + p = denom; + } + + denom_ans = *p; + p += dir; + for (i = 1; i <= N; i++) { + denom_ans = denom_ans * y + *p; + p += dir; + } + if (absx > 1) { + i = N - M; + return ::pow(x, static_cast(i)) * num_ans / denom_ans; + } else { + return num_ans / denom_ans; + } +} + +template +scalar_t lanczos_sum_expg_scaled(scalar_t x) { + // lanczos approximation + using accscalar_t = opmath_t; + + const accscalar_t lanczos_sum_expg_scaled_num[13] = { + 0.006061842346248906525783753964555936883222, + 0.5098416655656676188125178644804694509993, + 19.51992788247617482847860966235652136208, + 449.9445569063168119446858607650988409623, + 6955.999602515376140356310115515198987526, + 75999.29304014542649875303443598909137092, + 601859.6171681098786670226533699352302507, + 3481712.15498064590882071018964774556468, + 14605578.08768506808414169982791359218571, + 43338889.32467613834773723740590533316085, + 86363131.28813859145546927288977868422342, + 103794043.1163445451906271053616070238554, + 56906521.91347156388090791033559122686859}; + const accscalar_t lanczos_sum_expg_scaled_denom[13] = { + 1., + 66., + 1925., + 32670., + 357423., + 2637558., + 13339535., + 45995730., + 105258076., + 150917976., + 120543840., + 39916800., + 0}; + return ratevl( + static_cast(x), + lanczos_sum_expg_scaled_num, + sizeof(lanczos_sum_expg_scaled_num) / + sizeof(lanczos_sum_expg_scaled_num[0]) - + 1, + lanczos_sum_expg_scaled_denom, + sizeof(lanczos_sum_expg_scaled_denom) / + sizeof(lanczos_sum_expg_scaled_denom[0]) - + 1); +} + +template +scalar_t _igam_helper_fac(scalar_t a, scalar_t x) { + // compute x^a * exp(-a) / gamma(a) + // corrected from (15) and (16) in [igam2] by replacing exp(x - a) with + // 
exp(a - x). + + using accscalar_t = opmath_t; + accscalar_t ax, fac, res, num, numfac; + const accscalar_t MAXLOG = 88.72283905206835; + const accscalar_t EXP1 = 2.718281828459045; + const accscalar_t lanczos_g = 6.024680040776729583740234375; + + if (::fabs(a - x) > 0.4 * ::fabs(a)) { + ax = a * ::log(x) - x - ::lgamma(a); + if (ax < -MAXLOG) { + return 0.0; + } + return ::exp(ax); + } + + fac = a + lanczos_g - 0.5; + res = ::sqrt(fac / EXP1) / lanczos_sum_expg_scaled(a); + + if ((a < 200) && (x < 200)) { + res *= ::exp(a - x) * ::pow(x / fac, a); + } else { + num = x - a - lanczos_g + 0.5; + numfac = num / fac; + res *= ::exp(a * (::log1p(numfac) - numfac) + x * (0.5 - lanczos_g) / fac); + } + return res; +} + +template +scalar_t _igam_helper_series(scalar_t a, scalar_t x) { + // Compute igam using DLMF 8.11.4. [igam1] + + using accscalar_t = opmath_t; + const accscalar_t MACHEP = 5.9604644775390625E-8; + const int MAXITER = 2000; + + int i; + accscalar_t ans, ax, c, r; + + ax = _igam_helper_fac(a, x); + if (ax == 0.0) { + return 0.0; + } + + /* power series */ + r = a; + c = 1.0; + ans = 1.0; + + for (i = 0; i < MAXITER; i++) { + r += 1.0; + c *= x / r; + ans += c; + if (c <= MACHEP * ans) { + break; + } + } + return (ans * ax / a); +} + +template +scalar_t _igamc_helper_series(scalar_t a, scalar_t x) { + // Compute igamc using DLMF 8.7.3 [igam1]. This is related to the series in + // _igam_helper_series but extra care is taken to avoid cancellation. + + using accscalar_t = opmath_t; + int n; + accscalar_t fac = 1; + accscalar_t sum = 0; + accscalar_t term, logx; + const int MAXITER = 2000; + const accscalar_t MACHEP = 5.9604644775390625E-8; + + for (n = 1; n < MAXITER; n++) { + fac *= -x / n; + term = fac / (a + n); + sum += term; + if (::fabs(term) <= MACHEP * ::fabs(sum)) { + break; + } + } + + logx = ::log(x); + term = -::expm1(a * logx - ::lgamma(1 + a)); + return term - ::exp(a * logx - ::lgamma(a)) * sum; +} + +template +scalar_t _igam_helper_asymptotic_series(scalar_t a, scalar_t x, bool igam) { + // Compute igam/igamc using DLMF 8.12.3/8.12.4 [igam1] + + using accscalar_t = opmath_t; + const accscalar_t d[25][25] = { + {-3.3333333333333333e-1, 8.3333333333333333e-2, + -1.4814814814814815e-2, 1.1574074074074074e-3, + 3.527336860670194e-4, -1.7875514403292181e-4, + 3.9192631785224378e-5, -2.1854485106799922e-6, + -1.85406221071516e-6, 8.296711340953086e-7, + -1.7665952736826079e-7, 6.7078535434014986e-9, + 1.0261809784240308e-8, -4.3820360184533532e-9, + 9.1476995822367902e-10, -2.551419399494625e-11, + -5.8307721325504251e-11, 2.4361948020667416e-11, + -5.0276692801141756e-12, 1.1004392031956135e-13, + 3.3717632624009854e-13, -1.3923887224181621e-13, + 2.8534893807047443e-14, -5.1391118342425726e-16, + -1.9752288294349443e-15}, + {-1.8518518518518519e-3, -3.4722222222222222e-3, 2.6455026455026455e-3, + -9.9022633744855967e-4, 2.0576131687242798e-4, -4.0187757201646091e-7, + -1.8098550334489978e-5, 7.6491609160811101e-6, -1.6120900894563446e-6, + 4.6471278028074343e-9, 1.378633446915721e-7, -5.752545603517705e-8, + 1.1951628599778147e-8, -1.7543241719747648e-11, -1.0091543710600413e-9, + 4.1627929918425826e-10, -8.5639070264929806e-11, 6.0672151016047586e-14, + 7.1624989648114854e-12, -2.9331866437714371e-12, 5.9966963656836887e-13, + -2.1671786527323314e-16, -4.9783399723692616e-14, 2.0291628823713425e-14, + -4.13125571381061e-15}, + {4.1335978835978836e-3, -2.6813271604938272e-3, 7.7160493827160494e-4, + 2.0093878600823045e-6, -1.0736653226365161e-4, 5.2923448829120125e-5, + 
-1.2760635188618728e-5, 3.4235787340961381e-8, 1.3721957309062933e-6, + -6.298992138380055e-7, 1.4280614206064242e-7, -2.0477098421990866e-10, + -1.4092529910867521e-8, 6.228974084922022e-9, -1.3670488396617113e-9, + 9.4283561590146782e-13, 1.2872252400089318e-10, -5.5645956134363321e-11, + 1.1975935546366981e-11, -4.1689782251838635e-15, -1.0940640427884594e-12, + 4.6622399463901357e-13, -9.905105763906906e-14, 1.8931876768373515e-17, + 8.8592218725911273e-15}, + {6.4943415637860082e-4, 2.2947209362139918e-4, -4.6918949439525571e-4, + 2.6772063206283885e-4, -7.5618016718839764e-5, -2.3965051138672967e-7, + 1.1082654115347302e-5, -5.6749528269915966e-6, 1.4230900732435884e-6, + -2.7861080291528142e-11, -1.6958404091930277e-7, 8.0994649053880824e-8, + -1.9111168485973654e-8, 2.3928620439808118e-12, 2.0620131815488798e-9, + -9.4604966618551322e-10, 2.1541049775774908e-10, -1.388823336813903e-14, + -2.1894761681963939e-11, 9.7909989511716851e-12, -2.1782191880180962e-12, + 6.2088195734079014e-17, 2.126978363279737e-13, -9.3446887915174333e-14, + 2.0453671226782849e-14}, + {-8.618882909167117e-4, 7.8403922172006663e-4, + -2.9907248030319018e-4, -1.4638452578843418e-6, + 6.6414982154651222e-5, -3.9683650471794347e-5, + 1.1375726970678419e-5, 2.5074972262375328e-10, + -1.6954149536558306e-6, 8.9075075322053097e-7, + -2.2929348340008049e-7, 2.956794137544049e-11, + 2.8865829742708784e-8, -1.4189739437803219e-8, + 3.4463580499464897e-9, -2.3024517174528067e-13, + -3.9409233028046405e-10, 1.8602338968504502e-10, + -4.356323005056618e-11, 1.2786001016296231e-15, + 4.6792750266579195e-12, -2.1492464706134829e-12, + 4.9088156148096522e-13, -6.3385914848915603e-18, + -5.0453320690800944e-14}, + {-3.3679855336635815e-4, -6.9728137583658578e-5, 2.7727532449593921e-4, + -1.9932570516188848e-4, 6.7977804779372078e-5, 1.419062920643967e-7, + -1.3594048189768693e-5, 8.0184702563342015e-6, -2.2914811765080952e-6, + -3.252473551298454e-10, 3.4652846491085265e-7, -1.8447187191171343e-7, + 4.8240967037894181e-8, -1.7989466721743515e-14, -6.3061945000135234e-9, + 3.1624176287745679e-9, -7.8409242536974293e-10, 5.1926791652540407e-15, + 9.3589442423067836e-11, -4.5134262161632782e-11, 1.0799129993116827e-11, + -3.661886712685252e-17, -1.210902069055155e-12, 5.6807435849905643e-13, + -1.3249659916340829e-13}, + {5.3130793646399222e-4, -5.9216643735369388e-4, 2.7087820967180448e-4, + 7.9023532326603279e-7, -8.1539693675619688e-5, 5.6116827531062497e-5, + -1.8329116582843376e-5, -3.0796134506033048e-9, 3.4651553688036091e-6, + -2.0291327396058604e-6, 5.7887928631490037e-7, 2.338630673826657e-13, + -8.8286007463304835e-8, 4.7435958880408128e-8, -1.2545415020710382e-8, + 8.6496488580102925e-14, 1.6846058979264063e-9, -8.5754928235775947e-10, + 2.1598224929232125e-10, -7.6132305204761539e-16, -2.6639822008536144e-11, + 1.3065700536611057e-11, -3.1799163902367977e-12, 4.7109761213674315e-18, + 3.6902800842763467e-13}, + {3.4436760689237767e-4, 5.1717909082605922e-5, + -3.3493161081142236e-4, 2.812695154763237e-4, + -1.0976582244684731e-4, -1.2741009095484485e-7, + 2.7744451511563644e-5, -1.8263488805711333e-5, + 5.7876949497350524e-6, 4.9387589339362704e-10, + -1.0595367014026043e-6, 6.1667143761104075e-7, + -1.7562973359060462e-7, -1.2974473287015439e-12, + 2.695423606288966e-8, -1.4578352908731271e-8, + 3.887645959386175e-9, -3.8810022510194121e-17, + -5.3279941738772867e-10, 2.7437977643314845e-10, + -6.9957960920705679e-11, 2.5899863874868481e-17, + 8.8566890996696381e-12, -4.403168815871311e-12, + 
1.0865561947091654e-12}, + {-6.5262391859530942e-4, 8.3949872067208728e-4, -4.3829709854172101e-4, + -6.969091458420552e-7, 1.6644846642067548e-4, -1.2783517679769219e-4, + 4.6299532636913043e-5, 4.5579098679227077e-9, -1.0595271125805195e-5, + 6.7833429048651666e-6, -2.1075476666258804e-6, -1.7213731432817145e-11, + 3.7735877416110979e-7, -2.1867506700122867e-7, 6.2202288040189269e-8, + 6.5977038267330006e-16, -9.5903864974256858e-9, 5.2132144922808078e-9, + -1.3991589583935709e-9, 5.382058999060575e-16, 1.9484714275467745e-10, + -1.0127287556389682e-10, 2.6077347197254926e-11, -5.0904186999932993e-18, + -3.3721464474854592e-12}, + {-5.9676129019274625e-4, -7.2048954160200106e-5, + 6.7823088376673284e-4, -6.4014752602627585e-4, + 2.7750107634328704e-4, 1.8197008380465151e-7, + -8.4795071170685032e-5, 6.105192082501531e-5, + -2.1073920183404862e-5, -8.8585890141255994e-10, + 4.5284535953805377e-6, -2.8427815022504408e-6, + 8.7082341778646412e-7, 3.6886101871706965e-12, + -1.5344695190702061e-7, 8.862466778790695e-8, + -2.5184812301826817e-8, -1.0225912098215092e-14, + 3.8969470758154777e-9, -2.1267304792235635e-9, + 5.7370135528051385e-10, -1.887749850169741e-19, + -8.0931538694657866e-11, 4.2382723283449199e-11, + -1.1002224534207726e-11}, + {1.3324454494800656e-3, -1.9144384985654775e-3, 1.1089369134596637e-3, + 9.932404122642299e-7, -5.0874501293093199e-4, 4.2735056665392884e-4, + -1.6858853767910799e-4, -8.1301893922784998e-9, 4.5284402370562147e-5, + -3.127053674781734e-5, 1.044986828530338e-5, 4.8435226265680926e-11, + -2.1482565873456258e-6, 1.329369701097492e-6, -4.0295693092101029e-7, + -1.7567877666323291e-13, 7.0145043163668257e-8, -4.040787734999483e-8, + 1.1474026743371963e-8, 3.9642746853563325e-18, -1.7804938269892714e-9, + 9.7480262548731646e-10, -2.6405338676507616e-10, 5.794875163403742e-18, + 3.7647749553543836e-11}, + {1.579727660730835e-3, 1.6251626278391582e-4, -2.0633421035543276e-3, + 2.1389686185689098e-3, -1.0108559391263003e-3, -3.9912705529919201e-7, + 3.6235025084764691e-4, -2.8143901463712154e-4, 1.0449513336495887e-4, + 2.1211418491830297e-9, -2.5779417251947842e-5, 1.7281818956040463e-5, + -5.6413773872904282e-6, -1.1024320105776174e-11, 1.1223224418895175e-6, + -6.8693396379526735e-7, 2.0653236975414887e-7, 4.6714772409838506e-14, + -3.5609886164949055e-8, 2.0470855345905963e-8, -5.8091738633283358e-9, + -1.332821287582869e-16, 9.0354604391335133e-10, -4.9598782517330834e-10, + 1.3481607129399749e-10}, + {-4.0725121195140166e-3, 6.4033628338080698e-3, -4.0410161081676618e-3, + -2.183732802866233e-6, 2.1740441801254639e-3, -1.9700440518418892e-3, + 8.3595469747962458e-4, 1.9445447567109655e-8, -2.5779387120421696e-4, + 1.9009987368139304e-4, -6.7696499937438965e-5, -1.4440629666426572e-10, + 1.5712512518742269e-5, -1.0304008744776893e-5, 3.304517767401387e-6, + 7.9829760242325709e-13, -6.4097794149313004e-7, 3.8894624761300056e-7, + -1.1618347644948869e-7, -2.816808630596451e-15, 1.9878012911297093e-8, + -1.1407719956357511e-8, 3.2355857064185555e-9, 4.1759468293455945e-20, + -5.0423112718105824e-10}, + {-5.9475779383993003e-3, -5.4016476789260452e-4, 8.7910413550767898e-3, + -9.8576315587856125e-3, 5.0134695031021538e-3, 1.2807521786221875e-6, + -2.0626019342754683e-3, 1.7109128573523058e-3, -6.7695312714133799e-4, + -6.9011545676562133e-9, 1.8855128143995902e-4, -1.3395215663491969e-4, + 4.6263183033528039e-5, 4.0034230613321351e-11, -1.0255652921494033e-5, + 6.612086372797651e-6, -2.0913022027253008e-6, -2.0951775649603837e-13, + 3.9756029041993247e-7, 
-2.3956211978815887e-7, 7.1182883382145864e-8, + 8.925574873053455e-16, -1.2101547235064676e-8, 6.9350618248334386e-9, + -1.9661464453856102e-9}, + {1.7402027787522711e-2, -2.9527880945699121e-2, 2.0045875571402799e-2, + 7.0289515966903407e-6, -1.2375421071343148e-2, 1.1976293444235254e-2, + -5.4156038466518525e-3, -6.3290893396418616e-8, 1.8855118129005065e-3, + -1.473473274825001e-3, 5.5515810097708387e-4, 5.2406834412550662e-10, + -1.4357913535784836e-4, 9.9181293224943297e-5, -3.3460834749478311e-5, + -3.5755837291098993e-12, 7.1560851960630076e-6, -4.5516802628155526e-6, + 1.4236576649271475e-6, 1.8803149082089664e-14, -2.6623403898929211e-7, + 1.5950642189595716e-7, -4.7187514673841102e-8, -6.5107872958755177e-17, + 7.9795091026746235e-9}, + {3.0249124160905891e-2, 2.4817436002649977e-3, -4.9939134373457022e-2, + 5.9915643009307869e-2, -3.2483207601623391e-2, -5.7212968652103441e-6, + 1.5085251778569354e-2, -1.3261324005088445e-2, 5.5515262632426148e-3, + 3.0263182257030016e-8, -1.7229548406756723e-3, 1.2893570099929637e-3, + -4.6845138348319876e-4, -1.830259937893045e-10, 1.1449739014822654e-4, + -7.7378565221244477e-5, 2.5625836246985201e-5, 1.0766165333192814e-12, + -5.3246809282422621e-6, 3.349634863064464e-6, -1.0381253128684018e-6, + -5.608909920621128e-15, 1.9150821930676591e-7, -1.1418365800203486e-7, + 3.3654425209171788e-8}, + {-9.9051020880159045e-2, 1.7954011706123486e-1, -1.2989606383463778e-1, + -3.1478872752284357e-5, 9.0510635276848131e-2, -9.2828824411184397e-2, + 4.4412112839877808e-2, 2.7779236316835888e-7, -1.7229543805449697e-2, + 1.4182925050891573e-2, -5.6214161633747336e-3, -2.39598509186381e-9, + 1.6029634366079908e-3, -1.1606784674435773e-3, 4.1001337768153873e-4, + 1.8365800754090661e-11, -9.5844256563655903e-5, 6.3643062337764708e-5, + -2.076250624489065e-5, -1.1806020912804483e-13, 4.2131808239120649e-6, + -2.6262241337012467e-6, 8.0770620494930662e-7, 6.0125912123632725e-16, + -1.4729737374018841e-7}, + {-1.9994542198219728e-1, -1.5056113040026424e-2, 3.6470239469348489e-1, + -4.6435192311733545e-1, 2.6640934719197893e-1, 3.4038266027147191e-5, + -1.3784338709329624e-1, 1.276467178337056e-1, -5.6213828755200985e-2, + -1.753150885483011e-7, 1.9235592956768113e-2, -1.5088821281095315e-2, + 5.7401854451350123e-3, 1.0622382710310225e-9, -1.5335082692563998e-3, + 1.0819320643228214e-3, -3.7372510193945659e-4, -6.6170909729031985e-12, + 8.4263617380909628e-5, -5.5150706827483479e-5, 1.7769536448348069e-5, + 3.8827923210205533e-14, -3.53513697488768e-6, 2.1865832130045269e-6, + -6.6812849447625594e-7}, + {7.2438608504029431e-1, -1.3918010932653375, 1.0654143352413968, + 1.876173868950258e-4, -8.2705501176152696e-1, 8.9352433347828414e-1, + -4.4971003995291339e-1, -1.6107401567546652e-6, 1.9235590165271091e-1, + -1.6597702160042609e-1, 6.8882222681814333e-2, 1.3910091724608687e-8, + -2.146911561508663e-2, 1.6228980898865892e-2, -5.9796016172584256e-3, + -1.1287469112826745e-10, 1.5167451119784857e-3, -1.0478634293553899e-3, + 3.5539072889126421e-4, 8.1704322111801517e-13, -7.7773013442452395e-5, + 5.0291413897007722e-5, -1.6035083867000518e-5, 1.2469354315487605e-14, + 3.1369106244517615e-6}, + {1.6668949727276811, 1.165462765994632e-1, -3.3288393225018906, + 4.4692325482864037, -2.6977693045875807, -2.600667859891061e-4, + 1.5389017615694539, -1.4937962361134612, 6.8881964633233148e-1, + 1.3077482004552385e-6, -2.5762963325596288e-1, 2.1097676102125449e-1, + -8.3714408359219882e-2, -7.7920428881354753e-9, 2.4267923064833599e-2, + -1.7813678334552311e-2, 
6.3970330388900056e-3, 4.9430807090480523e-11, + -1.5554602758465635e-3, 1.0561196919903214e-3, -3.5277184460472902e-4, + 9.3002334645022459e-14, 7.5285855026557172e-5, -4.8186515569156351e-5, + 1.5227271505597605e-5}, + {-6.6188298861372935, 1.3397985455142589e+1, -1.0789350606845146e+1, + -1.4352254537875018e-3, 9.2333694596189809, -1.0456552819547769e+1, + 5.5105526029033471, 1.2024439690716742e-5, -2.5762961164755816, + 2.3207442745387179, -1.0045728797216284, -1.0207833290021914e-7, + 3.3975092171169466e-1, -2.6720517450757468e-1, 1.0235252851562706e-1, + 8.4329730484871625e-10, -2.7998284958442595e-2, 2.0066274144976813e-2, + -7.0554368915086242e-3, 1.9402238183698188e-12, 1.6562888105449611e-3, + -1.1082898580743683e-3, 3.654545161310169e-4, -5.1290032026971794e-11, + -7.6340103696869031e-5}, + {-1.7112706061976095e+1, -1.1208044642899116, 3.7131966511885444e+1, + -5.2298271025348962e+1, 3.3058589696624618e+1, 2.4791298976200222e-3, + -2.061089403411526e+1, 2.088672775145582e+1, -1.0045703956517752e+1, + -1.2238783449063012e-5, 4.0770134274221141, -3.473667358470195, + 1.4329352617312006, 7.1359914411879712e-8, -4.4797257159115612e-1, + 3.4112666080644461e-1, -1.2699786326594923e-1, -2.8953677269081528e-10, + 3.3125776278259863e-2, -2.3274087021036101e-2, 8.0399993503648882e-3, + -1.177805216235265e-9, -1.8321624891071668e-3, 1.2108282933588665e-3, + -3.9479941246822517e-4}, + {7.389033153567425e+1, -1.5680141270402273e+2, 1.322177542759164e+2, + 1.3692876877324546e-2, -1.2366496885920151e+2, 1.4620689391062729e+2, + -8.0365587724865346e+1, -1.1259851148881298e-4, 4.0770132196179938e+1, + -3.8210340013273034e+1, 1.719522294277362e+1, 9.3519707955168356e-7, + -6.2716159907747034, 5.1168999071852637, -2.0319658112299095, + -4.9507215582761543e-9, 5.9626397294332597e-1, -4.4220765337238094e-1, + 1.6079998700166273e-1, -2.4733786203223402e-8, -4.0307574759979762e-2, + 2.7849050747097869e-2, -9.4751858992054221e-3, 6.419922235909132e-6, + 2.1250180774699461e-3}, + {2.1216837098382522e+2, 1.3107863022633868e+1, -4.9698285932871748e+2, + 7.3121595266969204e+2, -4.8213821720890847e+2, -2.8817248692894889e-2, + 3.2616720302947102e+2, -3.4389340280087117e+2, 1.7195193870816232e+2, + 1.4038077378096158e-4, -7.52594195897599e+1, 6.651969984520934e+1, + -2.8447519748152462e+1, -7.613702615875391e-7, 9.5402237105304373, + -7.5175301113311376, 2.8943997568871961, -4.6612194999538201e-7, + -8.0615149598794088e-1, 5.8483006570631029e-1, -2.0845408972964956e-1, + 1.4765818959305817e-4, 5.1000433863753019e-2, -3.3066252141883665e-2, + 1.5109265210467774e-2}, + {-9.8959643098322368e+2, 2.1925555360905233e+3, -1.9283586782723356e+3, + -1.5925738122215253e-1, 1.9569985945919857e+3, -2.4072514765081556e+3, + 1.3756149959336496e+3, 1.2920735237496668e-3, -7.525941715948055e+2, + 7.3171668742208716e+2, -3.4137023466220065e+2, -9.9857390260608043e-6, + 1.3356313181291573e+2, -1.1276295161252794e+2, 4.6310396098204458e+1, + -7.9237387133614756e-6, -1.4510726927018646e+1, 1.1111771248100563e+1, + -4.1690817945270892, 3.1008219800117808e-3, 1.1220095449981468, + -7.6052379926149916e-1, 3.6262236505085254e-1, 2.216867741940747e-1, + 4.8683443692930507e-1}}; + + int k, n, sgn; + int maxpow = 0; + const accscalar_t MACHEP = 5.9604644775390625E-8; + accscalar_t lambda = x / a; + accscalar_t sigma = (x - a) / a; + accscalar_t eta, res, ck, ckterm, term, absterm; + accscalar_t absoldterm = INFINITY; + accscalar_t etapow[25] = {1}; + accscalar_t sum = 0; + accscalar_t afac = 1; + + if (igam) { + sgn = -1; + } else { + 
sgn = 1; + } + + if (lambda > 1) { + eta = ::sqrt(-2 * (::log1p(sigma) - sigma)); + } else if (lambda < 1) { + eta = -::sqrt(-2 * (::log1p(sigma) - sigma)); + } else { + eta = 0; + } + res = 0.5 * ::erfc(sgn * eta * ::sqrt(a / 2)); + + for (k = 0; k < 25; k++) { + ck = d[k][0]; + for (n = 1; n < 25; n++) { + if (n > maxpow) { + etapow[n] = eta * etapow[n - 1]; + maxpow += 1; + } + ckterm = d[k][n] * etapow[n]; + ck += ckterm; + if (::fabs(ckterm) < MACHEP * ::fabs(ck)) { + break; + } + } + term = ck * afac; + absterm = ::fabs(term); + if (absterm > absoldterm) { + break; + } + sum += term; + if (absterm < MACHEP * ::fabs(sum)) { + break; + } + absoldterm = absterm; + afac /= a; + } + res += sgn * ::exp(-0.5 * a * eta * eta) * sum / ::sqrt(2 * 3.1415926535 * a); + + return res; +} + +template +scalar_t _igamc_helper_continued_fraction(scalar_t a, scalar_t x) { + // Compute igamc using DLMF 8.9.2. [igam1] + + using accscalar_t = opmath_t; + int i; + accscalar_t ans, ax, c, yc, r, t, y, z; + accscalar_t pk, pkm1, pkm2, qk, qkm1, qkm2; + const int MAXITER = 2000; + const accscalar_t MACHEP = 5.9604644775390625E-8; + const accscalar_t BIG = 16777216.; + const accscalar_t BIGINV = 5.9604644775390625E-8; + + ax = _igam_helper_fac(a, x); + if (ax == 0.0) { + return 0.0; + } + + /* continued fraction */ + y = 1.0 - a; + z = x + y + 1.0; + c = 0.0; + pkm2 = 1.0; + qkm2 = x; + pkm1 = x + 1.0; + qkm1 = z * x; + ans = pkm1 / qkm1; + + for (i = 0; i < MAXITER; i++) { + c += 1.0; + y += 1.0; + z += 2.0; + yc = y * c; + pk = pkm1 * z - pkm2 * yc; + qk = qkm1 * z - qkm2 * yc; + if (qk != 0) { + r = pk / qk; + t = ::fabs((ans - r) / r); + ans = r; + } else { + t = 1.0; + } + pkm2 = pkm1; + pkm1 = pk; + qkm2 = qkm1; + qkm1 = qk; + if (::fabs(pk) > BIG) { + pkm2 *= BIGINV; + pkm1 *= BIGINV; + qkm2 *= BIGINV; + qkm1 *= BIGINV; + } + if (t <= MACHEP) { + break; + } + } + return ans * ax; +} + +template +scalar_t calc_igammac(scalar_t a, scalar_t x) { + /* the calculation of the regularized upper incomplete gamma function + * is done differently based on the values of a and x: + * - if x and/or a is at the boundary of defined region, then assign the + * result at the boundary + * - if a is large and a ~ x, then using Uniform Asymptotic Expansions for + * Large Parameter (see DLMF 8.12.4 [igam1]) + * - if x > 1.1 and x < a, using the subtraction from the regularized lower + * incomplete gamma + * - otherwise, calculate the series from [igam2] eq (5) + */ + + using accscalar_t = opmath_t; + accscalar_t absxma_a; + + const accscalar_t SMALL = 20.0; + const accscalar_t LARGE = 200.0; + const accscalar_t SMALLRATIO = 0.3; + const accscalar_t LARGERATIO = 4.5; + + if ((x < 0) || (a < 0)) { + // out of defined-region of the function + return NAN; + } else if (a == 0) { + if (x > 0) { + return 0.0; + } else { + return NAN; + } + } else if (x == 0) { + return 1.0; + } else if (isinf(a)) { + if (isinf(x)) { + return NAN; + } + return 1.0; + } else if (isinf(x)) { + return 0.0; + } + + absxma_a = ::fabs(x - a) / a; + if ((a > SMALL) && (a < LARGE) && (absxma_a < SMALLRATIO)) { + return _igam_helper_asymptotic_series(a, x, 0); + } else if ((a > LARGE) && (absxma_a < LARGERATIO / ::sqrt(a))) { + return _igam_helper_asymptotic_series(a, x, 0); + } + + if (x > 1.1) { + if (x < a) { + return 1.0 - _igam_helper_series(a, x); + } else { + return _igamc_helper_continued_fraction(a, x); + } + } else if (x <= 0.5) { + if (-0.4 / ::log(x) < a) { + return 1.0 - _igam_helper_series(a, x); + } else { + return _igamc_helper_series(a, x); 
+template <typename scalar_t>
+scalar_t calc_igammac(scalar_t a, scalar_t x) {
+  /* The calculation of the regularized upper incomplete gamma function
+   * is done differently based on the values of a and x:
+   * - if x and/or a is at the boundary of the defined region, then assign
+   *   the result at the boundary
+   * - if a is large and a ~ x, then use the Uniform Asymptotic Expansion for
+   *   Large Parameter (see DLMF 8.12.4 [igam1])
+   * - if x > 1.1 and x < a, use the subtraction from the regularized lower
+   *   incomplete gamma
+   * - otherwise, calculate the series from [igam2] eq (5)
+   */
+
+  using accscalar_t = opmath_t<scalar_t>;
+  accscalar_t absxma_a;
+
+  const accscalar_t SMALL = 20.0;
+  const accscalar_t LARGE = 200.0;
+  const accscalar_t SMALLRATIO = 0.3;
+  const accscalar_t LARGERATIO = 4.5;
+
+  if ((x < 0) || (a < 0)) {
+    // out of the defined region of the function
+    return NAN;
+  } else if (a == 0) {
+    if (x > 0) {
+      return 0.0;
+    } else {
+      return NAN;
+    }
+  } else if (x == 0) {
+    return 1.0;
+  } else if (isinf(a)) {
+    if (isinf(x)) {
+      return NAN;
+    }
+    return 1.0;
+  } else if (isinf(x)) {
+    return 0.0;
+  }
+
+  absxma_a = ::fabs(x - a) / a;
+  if ((a > SMALL) && (a < LARGE) && (absxma_a < SMALLRATIO)) {
+    return _igam_helper_asymptotic_series(a, x, 0);
+  } else if ((a > LARGE) && (absxma_a < LARGERATIO / ::sqrt(a))) {
+    return _igam_helper_asymptotic_series(a, x, 0);
+  }
+
+  if (x > 1.1) {
+    if (x < a) {
+      return 1.0 - _igam_helper_series(a, x);
+    } else {
+      return _igamc_helper_continued_fraction(a, x);
+    }
+  } else if (x <= 0.5) {
+    if (-0.4 / ::log(x) < a) {
+      return 1.0 - _igam_helper_series(a, x);
+    } else {
+      return _igamc_helper_series(a, x);
+    }
+  } else {
+    if (x * 1.1 < a) {
+      return 1.0 - _igam_helper_series(a, x);
+    } else {
+      return _igamc_helper_series(a, x);
+    }
+  }
+}
+
+template <typename scalar_t>
+scalar_t calc_igamma(scalar_t a, scalar_t x) {
+  /* The calculation of the regularized lower incomplete gamma function
+   * is done differently based on the values of a and x:
+   * - if x and/or a is at the boundary of the defined region, then assign
+   *   the result at the boundary
+   * - if a is large and a ~ x, then use the Uniform Asymptotic Expansion for
+   *   Large Parameter (see DLMF 8.12.3 [igam1])
+   * - if x > 1 and x > a, use the subtraction from the regularized upper
+   *   incomplete gamma
+   * - otherwise, calculate the series from [igam2] eq (4)
+   */
+
+  using accscalar_t = opmath_t<scalar_t>;
+  accscalar_t absxma_a;
+  const accscalar_t SMALL = 20.0;
+  const accscalar_t LARGE = 200.0;
+  const accscalar_t SMALLRATIO = 0.3;
+  const accscalar_t LARGERATIO = 4.5;
+
+  // boundary values following SciPy
+  if ((x < 0) || (a < 0)) {
+    // out of the defined region of the function
+    return NAN;
+  } else if (a == 0) {
+    if (x > 0) {
+      return 1.0;
+    } else {
+      return NAN;
+    }
+  } else if (x == 0) {
+    return 0.0; // zero integration limit
+  } else if (isinf(a)) {
+    if (isinf(x)) {
+      return NAN;
+    }
+    return 0.0;
+  } else if (isinf(x)) {
+    return 1.0;
+  }
+
+  /* Asymptotic regime where a ~ x. */
+  absxma_a = ::fabs(x - a) / a;
+  if ((a > SMALL) && (a < LARGE) && (absxma_a < SMALLRATIO)) {
+    return _igam_helper_asymptotic_series(a, x, 1);
+  } else if ((a > LARGE) && (absxma_a < LARGERATIO / ::sqrt(a))) {
+    return _igam_helper_asymptotic_series(a, x, 1);
+  }
+
+  if ((x > 1.0) && (x > a)) {
+    return 1.0 - calc_igammac(a, x);
+  }
+
+  return _igam_helper_series(a, x);
+}
+
+} // namespace
+
+// end of regularized lower & upper incomplete gamma
+
+namespace c10 {
+namespace metal {
+
+template <typename T>
+inline T igamma(T a, T b) {
+  return calc_igamma(a, b);
+}
+
+template <typename T>
+inline T igammac(T a, T b) {
+  return calc_igammac(a, b);
+}
+
+} // namespace metal
+} // namespace c10
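A quick host-side way to sanity-check the two kernels above, independent of Metal: the regularized lower and upper incomplete gamma functions are complements, so calc_igamma(a, x) + calc_igammac(a, x) should equal 1 (up to rounding) for a, x > 0, and P(a, x) itself has a simple power series. The sketch below is an editor's illustration, not part of the patch; the helper name reference_igamma is made up here. It implements the series P(a, x) = x^a e^{-x} / Gamma(a) * sum_{n>=0} x^n / (a (a+1) ... (a+n)) in double precision and checks it against the closed form P(1, x) = 1 - exp(-x).

#include <cassert>
#include <cmath>
#include <cstdio>

// Double-precision reference for the regularized lower incomplete gamma.
double reference_igamma(double a, double x) {
  double term = 1.0 / a; // n = 0 term of the series
  double sum = term;
  for (int n = 1; n < 1000; ++n) {
    term *= x / (a + n);
    sum += term;
    if (term < 1e-17 * sum) {
      break; // converged to double precision
    }
  }
  // Multiply by x^a e^{-x} / Gamma(a), computed in log space to avoid overflow.
  return sum * std::exp(-x + a * std::log(x) - std::lgamma(a));
}

int main() {
  // Closed form: P(1, x) = 1 - exp(-x).
  double p = reference_igamma(1.0, 2.0);
  assert(std::fabs(p - (1.0 - std::exp(-2.0))) < 1e-12);
  std::printf("P(1, 2) = %.12f\n", p);
  return 0;
}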
diff --git a/c10/metal/special_math.h b/c10/metal/special_math.h
index 34f6ab6d1d09..29a45ff4c30b 100644
--- a/c10/metal/special_math.h
+++ b/c10/metal/special_math.h
@@ -1,6 +1,7 @@
 // Implementation of specal math functions for Metal
 #pragma once
 #include
+#include
 #include
 #include
@@ -47,6 +48,11 @@ inline float erf(T x) {
   return r;
 }
 
+template <typename T>
+float erfc(T x) {
+  return 1.0 - erf(x);
+}
+
 template <typename T>
 inline float erfinv(T y) {
   /* coefficients in rational expansion */
diff --git a/c10/test/build.bzl b/c10/test/build.bzl
index 2f54c8a2faa5..deb917dd8fcf 100644
--- a/c10/test/build.bzl
+++ b/c10/test/build.bzl
@@ -46,7 +46,7 @@ def define_targets(rules):
             "util/typeid_test.cpp",
         ],
     ),
-    copts = ["-Wno-deprecated-declarations"],
+    copts = ["-Wno-deprecated-declarations", "-Wno-ctad-maybe-unsupported"],
     deps = [
         ":Macros",
         ":complex_math_test_common",
diff --git a/c10/test/core/SymInt_test.cpp b/c10/test/core/SymInt_test.cpp
index 7cefa1e4a771..e408543f5362 100644
--- a/c10/test/core/SymInt_test.cpp
+++ b/c10/test/core/SymInt_test.cpp
@@ -1,5 +1,6 @@
 #include
+#include
 #include
 #include
 #include
@@ -35,4 +36,169 @@ TEST(SymIntTest, Overflows) {
 }
 #endif
 
+namespace {
+
+// We need a SymNodeImpl that 1) has working arithmetic with
+// predictable results and 2) causes SymInt::maybe_as_int to return
+// nullopt so that we can hit all 4 cases (zero/one/both arguments
+// have null maybe_as_int) in the operator implementations.
+class ConstantIntPretendingToBeSymbolicSymNodeImpl
+    : public ConstantSymNodeImpl<int64_t> {
+ public:
+  using ConstantSymNodeImpl::ConstantSymNodeImpl;
+  std::optional<int64_t> constant_int() override {
+    return std::nullopt;
+  }
+  std::optional<int64_t> maybe_as_int() override {
+    return std::nullopt;
+  }
+  // Needs to be implemented for arithmetic to actually
+  // work. NestedIntSymNodeImpl does this, for example.
+  c10::SymNode wrap_int(int64_t num) override {
+    return SymNode(
+        c10::make_intrusive<ConstantIntPretendingToBeSymbolicSymNodeImpl>(num));
+  }
+
+  c10::SymNode wrap_bool(bool b) override {
+    return SymNode(c10::make_intrusive<ConstantSymNodeImpl<bool>>(b));
+  }
+
+  SymNode add(const SymNode& other) override {
+    return wrap_int(int_() + other->int_());
+  }
+
+  SymNode sub(const SymNode& other) override {
+    return wrap_int(int_() - other->int_());
+  }
+
+  SymNode mul(const SymNode& other) override {
+    return wrap_int(int_() * other->int_());
+  }
+
+  SymNode floordiv(const SymNode& other) override {
+    return wrap_int(int_() / other->int_());
+  }
+
+  SymNode sym_min(const SymNode& other) override {
+    return wrap_int(std::min(int_(), other->int_()));
+  }
+
+  SymNode sym_max(const SymNode& other) override {
+    return wrap_int(std::max(int_(), other->int_()));
+  }
+
+  SymNode mod(const SymNode& other) override {
+    return wrap_int(int_() % other->int_());
+  }
+
+  SymNode eq(const SymNode& other) override {
+    return wrap_bool(int_() == other->int_());
+  }
+
+  SymNode ne(const SymNode& other) override {
+    return wrap_bool(int_() != other->int_());
+  }
+
+  SymNode lt(const SymNode& other) override {
+    return wrap_bool(int_() < other->int_());
+  }
+
+  SymNode le(const SymNode& other) override {
+    return wrap_bool(int_() <= other->int_());
+  }
+
+  SymNode gt(const SymNode& other) override {
+    return wrap_bool(int_() > other->int_());
+  }
+
+  SymNode ge(const SymNode& other) override {
+    return wrap_bool(int_() >= other->int_());
+  }
+};
+
+SymInt create_symbolic_symint(int64_t value) {
+  return SymInt(SymNode(
+      c10::make_intrusive<ConstantIntPretendingToBeSymbolicSymNodeImpl>(
+          value)));
+}
+
+auto unwrap(const SymInt& x) {
+  return x.guard_int(__FILE__, __LINE__);
+}
+
+auto unwrap(bool b) {
+  return b;
+}
+
+template