diff --git a/CHANGELOG.md b/CHANGELOG.md index 238f81944..9234702d5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,9 @@ Full documentation for RCCL is available at [https://rccl.readthedocs.io](https: ## Unreleased - RCCL 2.27.7 for ROCm 7.1.1 +### Changed +* Enabling P2P batching with `RCCL_P2P_BATCH_ENABLE=1` is only applicable up to 32 nodes. + ### Resolved Issues * Fixed crash when using the librccl-profiler plugin with the all-to-all collective after the 2.27 update. @@ -16,7 +19,8 @@ Full documentation for RCCL is available at [https://rccl.readthedocs.io](https: * Added `RCCL_FORCE_ENABLE_DMABUF` as a debugging feature if the user wants to explicitly enable DMABUF and forego system/kernel checks. * Added `RCCL_P2P_BATCH_THRESHOLD` to set the message size limit for batching P2P operations. This mainly affects small message performance for alltoall at a large scale but also applies to alltoallv. * Added `RCCL_P2P_BATCH_ENABLE` to enable batching P2P operations to receive performance gains for smaller messages up to 4MB for alltoall when the workload requires it. This is to avoid performance dips for larger messages. -* Added `RCCL_CHANNEL_TUNING_ENABLE` to enable channel tuning that overrides RCCL's internal adjustments based on `threadThreshold`. +* Added `RCCL_CHANNEL_TUNING_ENABLE` to enable channel tuning that overrides RCCL's internal adjustments based on threadThreshold. + ### Changed diff --git a/src/enqueue.cc b/src/enqueue.cc index 575d84680..d41ebbe25 100644 --- a/src/enqueue.cc +++ b/src/enqueue.cc @@ -190,7 +190,7 @@ static void addWorkBatchToPlan( // batch further down. newBatch |= NCCL_MAX_DEV_WORK_BATCH_BYTES < chan->wipBatch.workBytes + workSize; if (workType == ncclDevWorkTypeP2p) { - newBatch |= (comm->nNodes > 2 && batchP2P)? (chan->wipBatch.nP2ps == NCCL_MAX_DEV_WORK_P2P_PER_BATCH) : (chan->wipBatch.nP2ps == 1); + newBatch |= (comm->nNodes > 2 && batchP2P && comm->nNodes <= 32)? (chan->wipBatch.nP2ps == NCCL_MAX_DEV_WORK_P2P_PER_BATCH) : (chan->wipBatch.nP2ps == 1); for (int i=0; i < chan->wipBatch.nP2ps; i++) { newBatch |= p2pRound == chan->wipBatch.p2pRounds[i]; } @@ -952,7 +952,15 @@ static ncclResult_t addP2pToPlan( bool proxySameProcess[2] = {true, true}; void** handles[2] = {NULL, NULL}; auto batchP2PEnableEnv = rcclParamP2pBatchEnable(); - bool batchP2P = batchP2PEnableEnv && ((sendBytes == -1)? recvBytes <= rcclParamP2pBatchThreshold() : sendBytes <= rcclParamP2pBatchThreshold()); + auto p2pBatchThreshold = rcclParamP2pBatchThreshold(); + bool belowThreshold = (recvBytes <= p2pBatchThreshold) && (sendBytes <= p2pBatchThreshold); + bool batchP2P = batchP2PEnableEnv && (sendBytes == recvBytes) && belowThreshold; + + //ncclP2pChannelBaseForRound now computes channel-base based on batching enablement (env. variable RCCL_P2P_BATCH_ENABLE=1) + //but batching is only applicable if msg size is below threshold which is not checked below + //this causes perf. dips in some cases but also boosts in other cases even when no batching happens because msg size is above threshold + //replacing line below with ncclP2pChannelBaseForRound(comm, p2pRound, batchP2P) can cause issues due to ncclP2pChannelBaseForRound calling the same routine + //channel base computed in taskAppend and here must be the same, but in taskAppend the call happens once and is cached for later usage, which is why it wouldn't be consistent with the call below uint8_t base = ncclP2pChannelBaseForRound(comm, p2pRound, batchP2PEnableEnv); if (comm->p2pNet) { for (int dir = 0; dir <= 1; dir++) { diff --git a/src/include/device.h b/src/include/device.h index 3e0cb12f3..5e864dfe1 100644 --- a/src/include/device.h +++ b/src/include/device.h @@ -317,10 +317,24 @@ inline __host__ uint8_t ncclP2pChannelBaseForRound(struct ncclComm* comm, int p2 // ncclP2pChannelToPart and ncclP2pChannelForPart are inverses. The device code // uses ncclP2pChannelToPart to determine which part "this" channel is responsible for. inline __host__ int ncclP2pChannelForPart(int nP2pChannels, int base, int part, int nParts, int nNodes) { + if (nNodes > 2) { + // Only works because nP2pChannels is pow2 + int nChannelsLog2 = countOneBits(nP2pChannels-1); + int delta = reverseBits(part, nChannelsLog2); + return (base + delta) & (nP2pChannels-1); + } else { return (base * nParts + part) & (nP2pChannels-1); + } } inline __device__ int ncclP2pChannelToPart(int nP2pChannels, int base, int channel, int nParts, int nNodes) { + if (nNodes > 2) { + // Only works because nP2pChannels is pow2 + int nChannelsLog2 = countOneBits(nP2pChannels-1); + int delta = (channel-base) & (nP2pChannels-1); + return reverseBits(delta, nChannelsLog2); + } else { return (channel - base * nParts) & (nP2pChannels-1); + } } struct alignas(16) ncclDevWorkColl {