From 5dd58a9194c53bf74c9c19fcaa57f2fc5c4cdee1 Mon Sep 17 00:00:00 2001
From: "Nichols A. Romero" <nick.romero@amd.com>
Date: Tue, 22 Jul 2025 19:45:35 +0000
Subject: [PATCH] [ROCm][tunableop] UT tolerance increase for
 matmul_small_brute_force_tunableop at FP16 (#158788)

TunableOp will sometimes select a less precise solution because of the small input vectors used in this unit test. Increase the FP16 tolerance for this test to 1e-1 to eliminate flakiness.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158788
Approved by: https://github.com/jeffdaily

(cherry picked from commit c917c63282c467ef942c99da3ce4fa57bceba603)
---
 test/test_linalg.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/test/test_linalg.py b/test/test_linalg.py
index fe000c4ae9efc..6c4e6ccf04cd8 100644
--- a/test/test_linalg.py
+++ b/test/test_linalg.py
@@ -4704,6 +4704,7 @@ def test_matmul_small_brute_force_3d_Nd(self, device, dtype):
     @onlyCUDA
     @skipCUDAIfNotRocm  # Skipping due to SM89 OOM in CI, UT doesn't do much on NV anyways
     @dtypes(*floating_types_and(torch.half))
+    @precisionOverride({torch.float16: 1e-1})  # TunableOp may occasionally find less precise solution
     def test_matmul_small_brute_force_tunableop(self, device, dtype):
         # disable tunableop buffer rotation for all tests everywhere, it can be slow
         # We set the TunableOp numerical check environment variable here because it is