diff --git a/aten/src/ATen/native/cuda/Reduce.cuh b/aten/src/ATen/native/cuda/Reduce.cuh index 521b467480900..7d1c45e785b79 100644 --- a/aten/src/ATen/native/cuda/Reduce.cuh +++ b/aten/src/ATen/native/cuda/Reduce.cuh @@ -1062,7 +1062,7 @@ ReduceConfig setReduceConfig(const TensorIterator& iter){ // In such case, values in each loaded vector always correspond to different outputs. if (fastest_moving_stride == sizeof(scalar_t)) { #ifdef USE_ROCM - if (reduction_on_fastest_striding_dimension && dim0 > 128 && iter.num_reduce_dims() == 1) { + if (reduction_on_fastest_striding_dimension && dim0 >= 128 && iter.num_reduce_dims() == 1) { #else if (reduction_on_fastest_striding_dimension && dim0 > 128 && iter.num_reduce_dims() == 1 && vt0 >= input_vec_size) { #endif