Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,9 @@ Full documentation for RCCL is available at [https://rccl.readthedocs.io](https:

## Unreleased - RCCL 2.27.7 for ROCm 7.1.1

### Changed
* Enabling P2P batching with `RCCL_P2P_BATCH_ENABLE=1` is only applicable up to 32 nodes.

### Resolved Issues

* Fixed crash when using the librccl-profiler plugin with the all-to-all collective after the 2.27 update.
Expand All @@ -16,7 +19,8 @@ Full documentation for RCCL is available at [https://rccl.readthedocs.io](https:
* Added `RCCL_FORCE_ENABLE_DMABUF` as a debugging feature if the user wants to explicitly enable DMABUF and forego system/kernel checks.
* Added `RCCL_P2P_BATCH_THRESHOLD` to set the message size limit for batching P2P operations. This mainly affects small message performance for alltoall at a large scale but also applies to alltoallv.
* Added `RCCL_P2P_BATCH_ENABLE` to enable batching P2P operations to receive performance gains for smaller messages up to 4MB for alltoall when the workload requires it. This is to avoid performance dips for larger messages.
* Added `RCCL_CHANNEL_TUNING_ENABLE` to enable channel tuning that overrides RCCL's internal adjustments based on `threadThreshold`.
* Added `RCCL_CHANNEL_TUNING_ENABLE` to enable channel tuning that overrides RCCL's internal adjustments based on threadThreshold.


### Changed

Expand Down
12 changes: 10 additions & 2 deletions src/enqueue.cc
Original file line number Diff line number Diff line change
Expand Up @@ -190,7 +190,7 @@ static void addWorkBatchToPlan(
// batch further down.
newBatch |= NCCL_MAX_DEV_WORK_BATCH_BYTES < chan->wipBatch.workBytes + workSize;
if (workType == ncclDevWorkTypeP2p) {
newBatch |= (comm->nNodes > 2 && batchP2P)? (chan->wipBatch.nP2ps == NCCL_MAX_DEV_WORK_P2P_PER_BATCH) : (chan->wipBatch.nP2ps == 1);
newBatch |= (comm->nNodes > 2 && batchP2P && comm->nNodes <= 32)? (chan->wipBatch.nP2ps == NCCL_MAX_DEV_WORK_P2P_PER_BATCH) : (chan->wipBatch.nP2ps == 1);
for (int i=0; i < chan->wipBatch.nP2ps; i++) {
newBatch |= p2pRound == chan->wipBatch.p2pRounds[i];
}
Expand Down Expand Up @@ -952,7 +952,15 @@ static ncclResult_t addP2pToPlan(
bool proxySameProcess[2] = {true, true};
void** handles[2] = {NULL, NULL};
auto batchP2PEnableEnv = rcclParamP2pBatchEnable();
bool batchP2P = batchP2PEnableEnv && ((sendBytes == -1)? recvBytes <= rcclParamP2pBatchThreshold() : sendBytes <= rcclParamP2pBatchThreshold());
auto p2pBatchThreshold = rcclParamP2pBatchThreshold();
bool belowThreshold = (recvBytes <= p2pBatchThreshold) && (sendBytes <= p2pBatchThreshold);
bool batchP2P = batchP2PEnableEnv && (sendBytes == recvBytes) && belowThreshold;

//ncclP2pChannelBaseForRound now computes channel-base based on batching enablement (env. variable RCCL_P2P_BATCH_ENABLE=1)
//but batching is only applicable if msg size is below threshold which is not checked below
//this causes perf. dips in some cases but also boosts in other cases even when no batching happens because msg size is above threshold
//replacing line below with ncclP2pChannelBaseForRound(comm, p2pRound, batchP2P) can cause issues due to ncclP2pChannelBaseForRound calling the same routine
//channel base computed in taskAppend and here must be the same, but in taskAppend the call happens once and is cached for later usage, which is why it wouldn't be consistent with the call below
uint8_t base = ncclP2pChannelBaseForRound(comm, p2pRound, batchP2PEnableEnv);
if (comm->p2pNet) {
for (int dir = 0; dir <= 1; dir++) {
Expand Down
14 changes: 14 additions & 0 deletions src/include/device.h
Original file line number Diff line number Diff line change
Expand Up @@ -317,10 +317,24 @@ inline __host__ uint8_t ncclP2pChannelBaseForRound(struct ncclComm* comm, int p2
// ncclP2pChannelToPart and ncclP2pChannelForPart are inverses. The device code
// uses ncclP2pChannelToPart to determine which part "this" channel is responsible for.
inline __host__ int ncclP2pChannelForPart(int nP2pChannels, int base, int part, int nParts, int nNodes) {
if (nNodes > 2) {
// Only works because nP2pChannels is pow2
int nChannelsLog2 = countOneBits(nP2pChannels-1);
int delta = reverseBits(part, nChannelsLog2);
return (base + delta) & (nP2pChannels-1);
} else {
return (base * nParts + part) & (nP2pChannels-1);
}
}
inline __device__ int ncclP2pChannelToPart(int nP2pChannels, int base, int channel, int nParts, int nNodes) {
if (nNodes > 2) {
// Only works because nP2pChannels is pow2
int nChannelsLog2 = countOneBits(nP2pChannels-1);
int delta = (channel-base) & (nP2pChannels-1);
return reverseBits(delta, nChannelsLog2);
} else {
return (channel - base * nParts) & (nP2pChannels-1);
}
}

struct alignas(16) ncclDevWorkColl {
Expand Down
Loading