Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
pycache/
__pycache__/
*.pyc
2 changes: 2 additions & 0 deletions input/config_file/rccl/rccl_config.json
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@
"end_msg_size": "16g",
"step_function": "2",
"threads_per_gpu": "1",
"data_types": [ "int8", "int32"],
"no_of_cycles": "10",
"warmup_iterations": "10",
"no_of_iterations": "1",
"check_iteration_count": "1",
Expand Down
2 changes: 2 additions & 0 deletions input/config_file/rccl/single_node_mi355_rccl.json
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
"start_msg_size": "1024",
"end_msg_size": "16g",
"step_function": "2",
"data_types": [ "int8", "int32"],
"no_of_cycles": "10",
"warmup_iterations": "10",
"no_of_iterations": "1",
"check_iteration_count": "1",
Expand Down
367 changes: 286 additions & 81 deletions lib/rccl_lib.py

Large diffs are not rendered by default.

13 changes: 13 additions & 0 deletions models/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
"""CVS Pydantic models for data validation."""

from models.rccl import (
RcclTests,
RcclTestsMultinodeRaw,
RcclTestsAggregated,
)

# Public names of this package: the schema classes imported above from
# models.rccl. This list controls what `from models import *` exposes.
__all__ = [
'RcclTests',
'RcclTestsMultinodeRaw',
'RcclTestsAggregated',
]
106 changes: 106 additions & 0 deletions models/rccl.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
# std libs
from typing import Annotated, Literal, Optional
import math

# pydantic libs
from pydantic import BaseModel, Field, model_validator, ConfigDict, field_validator

# Reusable constrained scalar types shared by the models in this module.
NonNegativeInt = Annotated[int, Field(ge=0)]  # integer >= 0
PositiveInt = Annotated[int, Field(gt=0)]  # integer > 0
NonNegativeFloat = Annotated[float, Field(ge=0.0)]  # float >= 0.0

# Values accepted by the `name` field of the result models.
Collective = Literal[
    'AllReduce',
    'AllGather',
    'Scatter',
    'Gather',
    'ReduceScatter',
    'SendRecv',
    'AllToAll',
    'AllToAllV',
    'Broadcast',
]

# Values accepted by the `type` field of the result models.
Type = Literal[
    # signed integers
    'int8',
    'int32',
    'int64',
    # unsigned integers
    'uint8',
    'uint32',
    'uint64',
    # floating point
    'float',
    'double',
    'half',
    'bfloat16',
    'fp8_e4m3',
    'fp8_e5m2',
]

# Values accepted by the `redop` field.
Redop = Literal[
    'sum',
    'prod',
    'min',
    'max',
    'avg',
    'all',
    'none',
]

# Values accepted by the `inPlace` field: exactly 0 or 1.
InPlace = Literal[0, 1]


class RcclTests(BaseModel):
    """Immutable schema for a single rccl-tests measurement record.

    Instances are frozen, so fields cannot be reassigned after validation.
    The measurement fields (`time`, `algBw`, `busBw`) must be non-negative
    and finite.
    """

    model_config = ConfigDict(frozen=True)

    # Identification of the measurement.
    numCycle: NonNegativeInt
    name: Collective
    size: PositiveInt
    type: Type
    redop: Redop
    inPlace: InPlace

    # Measured values.
    time: NonNegativeFloat
    algBw: NonNegativeFloat
    busBw: NonNegativeFloat
    wrong: int

    @field_validator('time', 'algBw', 'busBw')
    @classmethod
    def validate_not_nan_inf(cls, v: float, info) -> float:
        """Return `v` unchanged if it is finite; raise ValueError for NaN/Inf."""
        # A value that is neither NaN nor +/-Inf is finite, so one check
        # covers both rejection cases.
        if math.isfinite(v):
            return v
        raise ValueError(f'{info.field_name} cannot be NaN/Inf, got {v}')

class RcclTestsMultinodeRaw(RcclTests):
    """
    Schema for raw multi-node rccl-test results.

    Records are validated against this schema as soon as they are parsed;
    anything that does not conform is rejected immediately so that malformed
    data cannot cause odd behaviour later in the processing pipeline.
    """

    # Topology of the run; every count must be strictly positive.
    nodes: PositiveInt
    ranks: PositiveInt
    ranksPerNode: PositiveInt
    gpusPerRank: PositiveInt

    @model_validator(mode='after')
    def validate_ranks_relationship(self):
        """Check that `ranks` equals `nodes * ranksPerNode`.

        Returns the validated instance, or raises ValueError when the
        reported total does not match the product.
        """
        expected_ranks = self.nodes * self.ranksPerNode
        if self.ranks == expected_ranks:
            return self
        raise ValueError(
            f"ranks ({self.ranks}) must equal nodes ({self.nodes}) × "
            f"ranksPerNode ({self.ranksPerNode}) = {expected_ranks}"
        )

class RcclTestsAggregated(BaseModel):
    """
    Schema for rccl-test results aggregated over several cycles.

    Instances are frozen. The `name` field can be populated either by its
    field name or by its alias `collective` (populate_by_name=True).
    """

    model_config = ConfigDict(frozen=True, populate_by_name=True)

    # Grouping keys
    name: Collective = Field(alias='collective')
    size: PositiveInt
    type: Type
    inPlace: InPlace

    # Metadata
    num_runs: PositiveInt = Field(description='Number of cycles aggregated')

    # Aggregated metrics
    busBw_mean: NonNegativeFloat
    busBw_std: NonNegativeFloat
    algBw_mean: NonNegativeFloat
    algBw_std: NonNegativeFloat
    time_mean: NonNegativeFloat
    time_std: NonNegativeFloat

    # Multinode metadata (optional, None for single-node tests)
    nodes: Optional[PositiveInt] = None
    ranks: Optional[PositiveInt] = None
    ranksPerNode: Optional[PositiveInt] = None
    gpusPerRank: Optional[PositiveInt] = None

    @field_validator('busBw_std', 'algBw_std', 'time_std', mode='before')
    @classmethod
    def handle_nan_std(cls, v: object, info) -> object:
        """
        Convert NaN (from single-value std) to 0.0 and reject infinities.

        Pandas returns NaN for std of single value, which is correct
        mathematically, but we interpret it as 0 variability.

        This runs in 'before' mode on purpose: the fields are declared with
        a `ge=0.0` bound, and NaN never compares >= 0, so a validator that
        runs after the bound check may never get to see a NaN input.
        Normalising first makes the conversion independent of how the bound
        treats NaN.

        Returns the raw input unchanged when it is a finite number or when
        it cannot be interpreted as a number at all; in the latter case the
        regular field validation reports the type error.
        Raises ValueError when the input is +/-Inf.
        """
        try:
            as_float = float(v)
        except (TypeError, ValueError, OverflowError):
            # Not numeric: defer to pydantic's own float validation so the
            # caller gets the standard error for this field.
            return v
        if math.isnan(as_float):
            return 0.0
        if math.isinf(as_float):
            raise ValueError(f'{info.field_name} cannot be Inf')
        return v
2 changes: 2 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,5 @@ pytest-html
pytest-repeat
pytest-dependency
xlsxwriter
pydantic>=2.0
pandas
2 changes: 2 additions & 0 deletions tests/rccl/rccl_multinode_cvs.py
Original file line number Diff line number Diff line change
Expand Up @@ -407,6 +407,8 @@ def test_rccl_perf(phdl, shdl, cluster_dict, config_dict, rccl_collective, rccl_
end_msg_size = config_dict['end_msg_size'], \
step_function = config_dict['step_function'], \
threads_per_gpu = config_dict['threads_per_gpu'], \
data_types = config_dict['data_types'], \
no_of_cycles = config_dict['no_of_cycles'], \
warmup_iterations = config_dict['warmup_iterations'], \
no_of_iterations = config_dict['no_of_iterations'], \
check_iteration_count = config_dict['check_iteration_count'], \
Expand Down
2 changes: 2 additions & 0 deletions tests/rccl/rccl_multinode_default_cvs.py
Original file line number Diff line number Diff line change
Expand Up @@ -345,6 +345,8 @@ def test_rccl_perf(phdl, shdl, cluster_dict, config_dict, rccl_collective ):
end_msg_size = config_dict['end_msg_size'], \
step_function = config_dict['step_function'], \
threads_per_gpu = config_dict['threads_per_gpu'], \
data_types = config_dict['data_types'], \
no_of_cycles = config_dict['no_of_cycles'], \
warmup_iterations = config_dict['warmup_iterations'], \
no_of_iterations = config_dict['no_of_iterations'], \
check_iteration_count = config_dict['check_iteration_count'], \
Expand Down
2 changes: 2 additions & 0 deletions tests/rccl/rccl_singlenode_cvs.py
Original file line number Diff line number Diff line change
Expand Up @@ -309,6 +309,8 @@ def test_singlenode_perf(phdl, cluster_dict, config_dict, rccl_collective ):
debug_level = config_dict['debug_level'], \
rccl_result_file = config_dict['rccl_result_file'], \
no_of_local_ranks = config_dict['no_of_local_ranks'], \
data_types = config_dict['data_types'], \
no_of_cycles = config_dict['no_of_cycles'], \
verify_bus_bw = config_dict['verify_bus_bw'], \
verify_bw_dip = config_dict['verify_bw_dip'], \
verify_lat_dip = config_dict['verify_lat_dip'], \
Expand Down