Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions cvs/lib/rccl_lib.py
Original file line number Diff line number Diff line change
Expand Up @@ -548,7 +548,7 @@ def rccl_regression(
for node in vpc_node_list:
host_file_params = f'{host_file_params}{node} slots={proc_per_node}\n'

cmd = 'sudo rm -f /tmp/rccl_hosts_file.txt'
cmd = 'rm -f /tmp/rccl_hosts_file.txt'
shdl.exec(cmd)

cmd = f'echo "{host_file_params}" > /tmp/rccl_hosts_file.txt'
Expand Down Expand Up @@ -722,7 +722,7 @@ def rccl_perf(
for node in vpc_node_list:
host_file_params = f'{host_file_params}' + f'{node} slots={proc_per_node}\n'

cmd = 'sudo rm -f /tmp/rccl_hosts_file.txt'
cmd = 'rm -f /tmp/rccl_hosts_file.txt'
shdl.exec(cmd)

cmd = f'echo "{host_file_params}" > /tmp/rccl_hosts_file.txt'
Expand Down
18 changes: 18 additions & 0 deletions cvs/lib/utils_lib.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,24 @@ def convert_phdl_json_to_dict(dict_json):
return out_dict


def get_passwordless_sudo_status(phdl):
    """
    Return whether passwordless sudo is available on each node.

    Runs `sudo -n true` on every node through the parallel handle and echoes
    the exit status; the last line of each node's output is taken as that
    status so any text printed before it is ignored.

    Parameters:
      phdl: parallel SSH handle exposing exec(cmd) -> dict[node, output]

    Returns:
      dict[str, bool]: node -> True when `sudo -n true` succeeds. Nodes whose
      output is empty, missing (None) or not a string are reported as False.
    """
    out_dict = phdl.exec('sudo -n true >/dev/null 2>&1; echo $?')
    sudo_status = {}
    for node, output in out_dict.items():
        # A node that returned no usable text must not abort the check for
        # every other node; treat it the same as a failed sudo probe.
        text = output.strip() if isinstance(output, str) else ''
        # Strip the line itself as well so stray indentation around the
        # echoed exit status does not turn a success into a failure.
        last_line = text.splitlines()[-1].strip() if text else '1'
        sudo_status[node] = last_line == '0'
    return sudo_status


def get_model_from_rocm_smi_output(smi_output):
"""
Infer the GPU model identifier from a rocm-smi output snippet.
Expand Down
38 changes: 31 additions & 7 deletions cvs/tests/rccl/rccl_perf.py
Original file line number Diff line number Diff line change
Expand Up @@ -227,6 +227,14 @@ def test_collect_networkinfo(phdl):

def test_disable_firewall(phdl):
globals.error_list = []
sudo_status = get_passwordless_sudo_status(phdl)
no_sudo_nodes = [node for node, ok in sudo_status.items() if not ok]
if no_sudo_nodes:
log.warning(
"Skipping firewall disable check because passwordless sudo is unavailable on nodes: %s", no_sudo_nodes
)
update_test_result()
return
phdl.exec('sudo service ufw stop')
time.sleep(2)
out_dict = phdl.exec('sudo service ufw status')
Expand Down Expand Up @@ -286,12 +294,22 @@ def test_rccl_perf(phdl, shdl, cluster_dict, config_dict, rccl_collective):
- cluster_snapshot_debug controls whether before/after snapshots are taken.
"""

# Log a message to Dmesg to create a timestamp record
phdl.exec(f'sudo echo "Starting Test {rccl_collective}" | sudo tee /dev/kmsg')
globals.error_list = []
sudo_status = get_passwordless_sudo_status(phdl)
can_use_sudo = all(sudo_status.values())
if not can_use_sudo:
no_sudo_nodes = [node for node, ok in sudo_status.items() if not ok]
log.warning(
"Skipping dmesg markers/verification and sudo-only snapshots because passwordless sudo is unavailable "
"on nodes: %s",
no_sudo_nodes,
)

if can_use_sudo:
phdl.exec(f'sudo echo "Starting Test {rccl_collective}" | sudo tee /dev/kmsg')

# start_time = phdl.exec('date')
start_time = phdl.exec('date +"%a %b %e %H:%M"')
globals.error_list = []

# Build list of nodes and their VPC IPs (used by the RCCL test)
# make sure the VPC IPs are reachable from all nodes for passwordless ssh
Expand All @@ -302,7 +320,9 @@ def test_rccl_perf(phdl, shdl, cluster_dict, config_dict, rccl_collective):
vpc_node_list.append(cluster_dict['node_dict'][node]['vpc_ip'])

# Get cluster snapshot ..
if re.search('True', config_dict.get('cvs_params', {}).get('cluster_snapshot_debug', 'False'), re.I):
if can_use_sudo and re.search(
'True', config_dict.get('cvs_params', {}).get('cluster_snapshot_debug', 'False'), re.I
):
cluster_dict_before = create_cluster_metrics_snapshot(phdl)

# Use the new grouped parameter function
Expand All @@ -325,13 +345,17 @@ def test_rccl_perf(phdl, shdl, cluster_dict, config_dict, rccl_collective):

# Scan dmesg between start and end times cluster wide ..
# end_time = phdl.exec('date')
phdl.exec(f'sudo echo "End of Test {rccl_collective}" | sudo tee /dev/kmsg')
if can_use_sudo:
phdl.exec(f'sudo echo "End of Test {rccl_collective}" | sudo tee /dev/kmsg')

end_time = phdl.exec('date +"%a %b %e %H:%M"')
verify_dmesg_for_errors(phdl, start_time, end_time, till_end_flag=True)
if can_use_sudo:
verify_dmesg_for_errors(phdl, start_time, end_time, till_end_flag=True)

# Get new cluster snapshot and compare ..
if re.search('True', config_dict.get('cvs_params', {}).get('cluster_snapshot_debug', 'False'), re.I):
if can_use_sudo and re.search(
'True', config_dict.get('cvs_params', {}).get('cluster_snapshot_debug', 'False'), re.I
):
cluster_dict_after = create_cluster_metrics_snapshot(phdl)
compare_cluster_metrics_snapshots(cluster_dict_before, cluster_dict_after)

Expand Down
38 changes: 31 additions & 7 deletions cvs/tests/rccl/rccl_regression.py
Original file line number Diff line number Diff line change
Expand Up @@ -311,6 +311,14 @@ def test_collect_networkinfo(phdl):

def test_disable_firewall(phdl):
globals.error_list = []
sudo_status = get_passwordless_sudo_status(phdl)
no_sudo_nodes = [node for node, ok in sudo_status.items() if not ok]
if no_sudo_nodes:
log.warning(
"Skipping firewall disable check because passwordless sudo is unavailable on nodes: %s", no_sudo_nodes
)
update_test_result()
return
phdl.exec('sudo service ufw stop')
time.sleep(2)
out_dict = phdl.exec('sudo service ufw status')
Expand Down Expand Up @@ -356,13 +364,23 @@ def test_rccl_perf(phdl, shdl, cluster_dict, config_dict, rccl_collective, regre
- cluster_snapshot_debug controls whether before/after snapshots are taken.
"""

# Log a message to Dmesg to create a timestamp record
globals.error_list = []
sudo_status = get_passwordless_sudo_status(phdl)
can_use_sudo = all(sudo_status.values())
if not can_use_sudo:
no_sudo_nodes = [node for node, ok in sudo_status.items() if not ok]
log.warning(
"Skipping dmesg markers/verification and sudo-only snapshots because passwordless sudo is unavailable "
"on nodes: %s",
no_sudo_nodes,
)

params_str = ' '.join(f'{k}={v}' for k, v in regression_params.items())
phdl.exec(f'sudo echo "Starting Test {rccl_collective} {params_str}" | sudo tee /dev/kmsg')
if can_use_sudo:
phdl.exec(f'sudo echo "Starting Test {rccl_collective} {params_str}" | sudo tee /dev/kmsg')

# start_time = phdl.exec('date')
start_time = phdl.exec('date +"%a %b %e %H:%M"')
globals.error_list = []
node_list = list(cluster_dict['node_dict'].keys())

# Build list of nodes and their VPC IPs (used by the RCCL test)
Expand All @@ -373,7 +391,9 @@ def test_rccl_perf(phdl, shdl, cluster_dict, config_dict, rccl_collective, regre
vpc_node_list.append(cluster_dict['node_dict'][node]['vpc_ip'])

# Get cluster snapshot ..
if re.search('True', config_dict.get('cvs_params', {}).get('cluster_snapshot_debug', 'False'), re.I):
if can_use_sudo and re.search(
'True', config_dict.get('cvs_params', {}).get('cluster_snapshot_debug', 'False'), re.I
):
cluster_dict_before = create_cluster_metrics_snapshot(phdl)

# Build env_overrides from all regression parameters (convert values to strings)
Expand All @@ -399,13 +419,17 @@ def test_rccl_perf(phdl, shdl, cluster_dict, config_dict, rccl_collective, regre

# Scan dmesg between start and end times cluster wide ..
# end_time = phdl.exec('date')
phdl.exec(f'sudo echo "End of Test {rccl_collective} {params_str}" | sudo tee /dev/kmsg')
if can_use_sudo:
phdl.exec(f'sudo echo "End of Test {rccl_collective} {params_str}" | sudo tee /dev/kmsg')

end_time = phdl.exec('date +"%a %b %e %H:%M"')
verify_dmesg_for_errors(phdl, start_time, end_time, till_end_flag=True)
if can_use_sudo:
verify_dmesg_for_errors(phdl, start_time, end_time, till_end_flag=True)

# Get new cluster snapshot and compare ..
if re.search('True', config_dict.get('cvs_params', {}).get('cluster_snapshot_debug', 'False'), re.I):
if can_use_sudo and re.search(
'True', config_dict.get('cvs_params', {}).get('cluster_snapshot_debug', 'False'), re.I
):
cluster_dict_after = create_cluster_metrics_snapshot(phdl)
compare_cluster_metrics_snapshots(cluster_dict_before, cluster_dict_after)

Expand Down
Loading