diff --git a/test/distributed/_composable/fsdp/test_fully_shard_memory.py b/test/distributed/_composable/fsdp/test_fully_shard_memory.py
index 340fe913c1eba..de6df77479c92 100644
--- a/test/distributed/_composable/fsdp/test_fully_shard_memory.py
+++ b/test/distributed/_composable/fsdp/test_fully_shard_memory.py
@@ -117,6 +117,9 @@ def _test_fully_shard_training_memory(
         # number is kept much smaller than the actual memory usage, which is on
         # the order of 100-200+ MB)
         buffer_mb = 16
+        # The default workspace for hipblaslt is larger than for cublas/cublaslt,
+        # which requires a slight increase to this buffer value.
+        buffer_mb = 16 if torch.version.cuda else 18
         if reshard_after_forward:
             # 3x max unsharded block parameters (current all-gather + copy-out
             # and next all-gather), non-block parameters, and other
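
For reference, a minimal sketch (not part of the diff, assuming only a working PyTorch install) of the backend check the added line relies on: `torch.version.cuda` is a version string on CUDA builds and `None` on ROCm builds, where `torch.version.hip` is set instead, so the expression falls through to the larger hipblaslt buffer on ROCm.

```python
import torch

# Sketch of the backend-dependent buffer selection used in the test above.
# torch.version.cuda is e.g. "12.4" on CUDA builds and None on ROCm builds;
# torch.version.hip is the mirror image. hipblaslt reserves a larger default
# workspace than cublas/cublaslt, hence the extra headroom on ROCm.
buffer_mb = 16 if torch.version.cuda else 18
print(f"cuda={torch.version.cuda} hip={torch.version.hip} -> buffer_mb={buffer_mb}")
```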