From c9ba49d261fb2e23ba9a01e61770a55369c8d552 Mon Sep 17 00:00:00 2001
From: "Nichols A. Romero" <165712832+naromero77amd@users.noreply.github.com>
Date: Wed, 23 Jul 2025 11:32:26 -0500
Subject: [PATCH] =?UTF-8?q?[release/2.7][ROCm][tunableop]=20UT=20tolerance?=
 =?UTF-8?q?=20increase=20for=20matmul=5Fsmall=5Fbrute=5Fforce=5F=E2=80=A6?=
 =?UTF-8?q?=20(#2397)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

TunableOp will sometimes select a less precise solution because of the
small input vectors used in this unit test. Raise the float16 tolerance
to 1e-1 to eliminate the flakiness.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158788
Approved by: https://github.com/jeffdaily

(cherry picked from commit c917c63282c467ef942c99da3ce4fa57bceba603)
---
 test/test_linalg.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/test/test_linalg.py b/test/test_linalg.py
index b5ed3af02729f..1f5d4009cebba 100644
--- a/test/test_linalg.py
+++ b/test/test_linalg.py
@@ -4759,6 +4759,7 @@ def test_matmul_small_brute_force_3d_Nd(self, device, dtype):
     @onlyCUDA
     @skipCUDAIfNotRocm  # Skipping due to SM89 OOM in CI, UT doesn't do much on NV anyways
     @dtypes(*floating_types_and(torch.half))
+    @precisionOverride({torch.float16: 1e-1})  # TunableOp may occasionally find less precise solution
     def test_matmul_small_brute_force_tunableop(self, device, dtype):
         # disable tunableop buffer rotation for all tests everywhere, it can be slow
         # We set the TunableOp numerical check environment variable here because it is