From 86d9ffbfd7cb43f23daab30f31f4689598d22b2b Mon Sep 17 00:00:00 2001
From: Manoj S K
Date: Wed, 19 Nov 2025 08:34:31 -0800
Subject: [PATCH 1/7] Introduce Prometheus exporter integration

We use the publicly available device-metrics-exporter package to provide
Prometheus exporter capability. It is added to the nodes to export device
metrics, which can be collected by a central node, and Grafana is integrated
on top of the same Prometheus data source.

Tested locally for deployment using ./utils/deploy_monitoring_stack.sh, which
deploys these three utilities to the nodes. After this, any other CVS tests
can be run.

Device Metrics Exporter runs 24/7, exposing GPU metrics.
Prometheus scrapes metrics every 15 seconds automatically.
Tests run independently - metrics are collected in the background.
After a test, you can query Prometheus to see what happened during the test.
---
 conftest.py                                   |  13 +-
 input/cluster_file/dummy_monitor_cluster.json |  26 ++
 input/cluster_file/local_test_cluster.json    |  18 +
 .../monitoring/monitoring_config.json         |  38 ++
 lib/device_metrics_lib.py                     | 242 ++++++++++
 lib/gpu_metrics_lib.py                        | 416 ++++++++++++++++++
 lib/utils_lib.py                              |  75 ++++
 monitoring/prometheus/alert_rules.yml         |  74 ++++
 monitoring/prometheus/prometheus.yml          |  54 +++
 .../install_device_metrics_exporter.py        | 261 +++++++++++
 utils/deploy_monitoring_stack.sh              | 229 ++++++++++
 11 files changed, 1442 insertions(+), 4 deletions(-)
 create mode 100644 input/cluster_file/dummy_monitor_cluster.json
 create mode 100644 input/cluster_file/local_test_cluster.json
 create mode 100644 input/config_file/monitoring/monitoring_config.json
 create mode 100644 lib/device_metrics_lib.py
 create mode 100644 lib/gpu_metrics_lib.py
 create mode 100644 monitoring/prometheus/alert_rules.yml
 create mode 100644 monitoring/prometheus/prometheus.yml
 create mode 100644 tests/monitoring/install_device_metrics_exporter.py
 create mode 100755 utils/deploy_monitoring_stack.sh

diff --git a/conftest.py b/conftest.py
index 4c30bcfc..58ab331b 100644
--- a/conftest.py
+++ b/conftest.py
@@ -10,12 +10,17 @@
 # Add all additional cmd line arguments for the script
 def pytest_addoption(parser):
-    parser.addoption( "--cluster_file", action="store", required=True, help="Input file with all the details of the cluster, nodes, switches in JSON format" )
-    parser.addoption( "--config_file", action="store", required=True, help="Input file with all configurations and parameters for tests in JSON format" )
-
+    parser.addoption("--cluster_file", action="store", default=None,
+                     help="Path to the cluster JSON file")
+    parser.addoption("--config_file", action="store", default=None,
+                     help="Path to the config JSON file")
+    parser.addoption("--prometheus-url", action="store", default=None,
+                     help="Prometheus server URL (optional)")
+    parser.addoption("--grafana-url", action="store", default=None,
+                     help="Grafana server URL (optional)")
+
 def pytest_metadata(metadata):
     """Add CVS version metadata for both console output and HTML report."""
-    # Read CVS version from version.txt
     cvs_version = "Unknown"
     version_file = os.path.join(os.path.dirname(__file__), "version.txt")
diff --git a/input/cluster_file/dummy_monitor_cluster.json b/input/cluster_file/dummy_monitor_cluster.json
new file mode 100644
index 00000000..6ac97ad3
--- /dev/null
+++ b/input/cluster_file/dummy_monitor_cluster.json
@@ -0,0 +1,26 @@
+{
+  "username": "all-os-star",
+  "priv_key_file": "/home/all-os-star/.ssh/id_rsa",
+
+  "head_node_dict": {
+    "mgmt_ip": "10.0.1.100"
+  },
+
+  "node_dict": {
+    "10.0.1.10": {
+      "hostname": "gpu-node-1",
"gpu-node-1", + "vpc_ip": "10.0.1.10", + "bmc_ip": "10.0.2.10" + }, + "10.0.1.11": { + "hostname": "gpu-node-2", + "vpc_ip": "10.0.1.11", + "bmc_ip": "10.0.2.11" + }, + "10.0.1.12": { + "hostname": "gpu-node-3", + "vpc_ip": "10.0.1.12", + "bmc_ip": "10.0.2.12" + } + } +} diff --git a/input/cluster_file/local_test_cluster.json b/input/cluster_file/local_test_cluster.json new file mode 100644 index 00000000..3ad2f3ac --- /dev/null +++ b/input/cluster_file/local_test_cluster.json @@ -0,0 +1,18 @@ +{ + "_comment": "Local test cluster configuration for single-machine testing", + "username": "{user-id}", + "priv_key_file": "/home/{user-id}/.ssh/id_rsa", + + "head_node_dict": { + "mgmt_ip": "localhost" + }, + + "node_dict": { + "localhost": { + "ip_addr": "127.0.0.1", + "hostname": "localhost", + "vpc_ip": "localhost", + "bmc_ip": "NA" + } + } +} diff --git a/input/config_file/monitoring/monitoring_config.json b/input/config_file/monitoring/monitoring_config.json new file mode 100644 index 00000000..4e7ee738 --- /dev/null +++ b/input/config_file/monitoring/monitoring_config.json @@ -0,0 +1,38 @@ +{ + "monitoring": { + "device_metrics_exporter_version": "{device-metrics-version}", + "device_metrics_exporter_image": "rocm/device-metrics-exporter:{device-metrics-version}", + "device_metrics_exporter_port": 5000, + + "prometheus_host": "{prometheus-host}", + "prometheus_port": 9090, + "prometheus_version": "{prometheus-version}", + "prometheus_url": "http://{prometheus-host}:{prometheus-port}", + + "grafana_host": "{grafana-host}", + "grafana_port": 3000, + "grafana_version": "{grafana-version}", + "grafana_url": "http://{grafana-host}:{grafana-port}", + "grafana_username": "admin", + "grafana_password": "{grafana-password}", + "grafana_api_key": "{grafana-api-key}", + + "scrape_interval": "15s", + "scrape_timeout": "10s", + "retention_days": 30, + + "alert_thresholds": { + "temperature_warning": 95, + "temperature_critical": 105, + "power_warning": 700, + "ecc_error_rate_warning": 10, + "memory_usage_warning": 90 + }, + + "deployment": { + "docker_network": "host", + "restart_policy": "unless-stopped", + "log_level": "INFO" + } + } +} \ No newline at end of file diff --git a/lib/device_metrics_lib.py b/lib/device_metrics_lib.py new file mode 100644 index 00000000..2eef53e8 --- /dev/null +++ b/lib/device_metrics_lib.py @@ -0,0 +1,242 @@ +''' +Copyright 2025 Advanced Micro Devices, Inc. 
+Device Metrics Integration Library for CVS +''' + +import requests +import json +import logging +from typing import Dict, List, Optional, Any +from datetime import datetime + +log = logging.getLogger(__name__) + + +class PrometheusClient: + """Client for querying Prometheus API to retrieve GPU metrics.""" + + def __init__(self, prometheus_url: str, timeout: int = 30): + self.base_url = prometheus_url.rstrip('/') + self.timeout = timeout + self.api_url = f"{self.base_url}/api/v1" + + def check_health(self) -> bool: + """Check if Prometheus server is healthy.""" + try: + response = requests.get(f"{self.base_url}/-/healthy", timeout=5) + return response.status_code == 200 + except Exception as e: + log.error(f"Prometheus health check failed: {e}") + return False + + def query_instant(self, query: str, time: Optional[str] = None) -> Optional[Dict]: + """Execute instant Prometheus query.""" + params = {'query': query} + if time: + params['time'] = time + + try: + response = requests.get( + f"{self.api_url}/query", + params=params, + timeout=self.timeout + ) + response.raise_for_status() + data = response.json() + + if data.get('status') == 'success': + return data.get('data') + else: + log.error(f"Prometheus query failed: {data.get('error')}") + return None + + except Exception as e: + log.error(f"Error querying Prometheus: {e}") + return None + + def query_range(self, query: str, start: str, end: str, step: str = '15s') -> Optional[Dict]: + """Execute range Prometheus query for time-series data.""" + params = { + 'query': query, + 'start': start, + 'end': end, + 'step': step + } + + try: + response = requests.get( + f"{self.api_url}/query_range", + params=params, + timeout=self.timeout + ) + response.raise_for_status() + data = response.json() + + if data.get('status') == 'success': + return data.get('data') + else: + log.error(f"Prometheus range query failed: {data.get('error')}") + return None + + except Exception as e: + log.error(f"Error querying Prometheus range: {e}") + return None + + +def get_gpu_metrics_from_prometheus( + prom_client: PrometheusClient, + node: str, + gpu_ids: Optional[List[str]] = None +) -> Dict[str, Dict[str, float]]: + """ + Retrieve GPU metrics from Prometheus for a specific node. 
+ + Returns: + { + '0': {'temperature': 45.0, 'power': 300.5, 'utilization': 85.0}, + '1': {'temperature': 46.0, 'power': 295.3, 'utilization': 82.0} + } + """ + metrics_dict = {} + + # Query temperature + temp_query = f'amdgpu_temperature_celsius{{node="{node}", sensor="edge"}}' + temp_data = prom_client.query_instant(temp_query) + + if temp_data and temp_data.get('result'): + for result in temp_data['result']: + gpu_id = result['metric'].get('gpu', 'unknown') + if gpu_ids is None or gpu_id in gpu_ids: + if gpu_id not in metrics_dict: + metrics_dict[gpu_id] = {} + metrics_dict[gpu_id]['temperature'] = float(result['value'][1]) + + # Query power consumption + power_query = f'amdgpu_power_watts{{node="{node}"}}' + power_data = prom_client.query_instant(power_query) + + if power_data and power_data.get('result'): + for result in power_data['result']: + gpu_id = result['metric'].get('gpu', 'unknown') + if gpu_ids is None or gpu_id in gpu_ids: + if gpu_id not in metrics_dict: + metrics_dict[gpu_id] = {} + metrics_dict[gpu_id]['power'] = float(result['value'][1]) + + # Query GPU utilization + util_query = f'amdgpu_gpu_busy_percent{{node="{node}"}}' + util_data = prom_client.query_instant(util_query) + + if util_data and util_data.get('result'): + for result in util_data['result']: + gpu_id = result['metric'].get('gpu', 'unknown') + if gpu_ids is None or gpu_id in gpu_ids: + if gpu_id not in metrics_dict: + metrics_dict[gpu_id] = {} + metrics_dict[gpu_id]['utilization'] = float(result['value'][1]) + + return metrics_dict + + +def get_device_exporter_health( + prom_client: PrometheusClient, + nodes: List[str] +) -> Dict[str, bool]: + """Check health status of Device Metrics Exporter on all nodes.""" + health_dict = {} + + for node in nodes: + query = f'up{{job="device-metrics-exporter", node="{node}"}}' + data = prom_client.query_instant(query) + + if data and data.get('result'): + is_up = float(data['result'][0]['value'][1]) == 1.0 + health_dict[node] = is_up + else: + health_dict[node] = False + + return health_dict + + +def create_grafana_annotation( + grafana_url: str, + text: str, + tags: List[str] = None, + api_key: Optional[str] = None, + username: Optional[str] = None, + password: Optional[str] = None, + time: Optional[int] = None +) -> bool: + """Create annotation in Grafana to mark test events on dashboards.""" + if tags is None: + tags = ['cvs-test'] + + if time is None: + time = int(datetime.now().timestamp() * 1000) + + url = f"{grafana_url.rstrip('/')}/api/annotations" + + payload = { + 'text': text, + 'tags': tags, + 'time': time + } + + headers = {'Content-Type': 'application/json'} + + if not api_key and (not username or not password): + log.warning("Grafana annotation requested without credentials or API key; skipping.") + return False + + if api_key: + headers['Authorization'] = f'Bearer {api_key}' + auth = None + else: + auth = (username, password) + + try: + response = requests.post( + url, + json=payload, + headers=headers, + auth=auth, + timeout=10 + ) + response.raise_for_status() + log.info(f"Created Grafana annotation: {text}") + return True + + except Exception as e: + log.error(f"Failed to create Grafana annotation: {e}") + return False + + +# Test function +if __name__ == '__main__': + import sys + + if len(sys.argv) < 3: + print("Usage: python device_metrics_lib.py ") + print("Example: python device_metrics_lib.py http://localhost:9090 localhost") + sys.exit(1) + + prometheus_url = sys.argv[1] + node = sys.argv[2] + + print(f"Testing Prometheus integration with 
{prometheus_url}") + + client = PrometheusClient(prometheus_url) + + if not client.check_health(): + print("ERROR: Prometheus server is not healthy") + sys.exit(1) + print("Prometheus server is healthy") + + metrics = get_gpu_metrics_from_prometheus(client, node) + if metrics: + print(f"Retrieved metrics for {len(metrics)} GPUs") + for gpu_id, data in metrics.items(): + print(f" GPU {gpu_id}: Temp={data.get('temperature', 'N/A')}°C, " + f"Power={data.get('power', 'N/A')}W") + else: + print("WARNING: No GPU metrics found") diff --git a/lib/gpu_metrics_lib.py b/lib/gpu_metrics_lib.py new file mode 100644 index 00000000..c9d6b556 --- /dev/null +++ b/lib/gpu_metrics_lib.py @@ -0,0 +1,416 @@ +''' +Copyright 2025 Advanced Micro Devices, Inc. +All rights reserved. +''' + +""" +Device Metrics Integration Library for CVS + +This module provides integration between CVS and AMD ROCm Device Metrics Exporter +via Prometheus. It enables CVS to query GPU metrics from Prometheus instead of +(or in addition to) SSH-based amd-smi/rocm-smi commands. + +Device Metrics Exporter: https://github.com/ROCm/device-metrics-exporter +""" + +import requests +import json +import time +from datetime import datetime, timedelta +from typing import Dict, List, Optional, Any, Tuple +import logging + +log = logging.getLogger(__name__) + + +class PrometheusClient: + """ + Client for querying Prometheus server that scrapes Device Metrics Exporter. + """ + + def __init__(self, prometheus_url: str , timeout: int = 30): + if not prometheus_url: + # fall back only if truly absent + prometheus_url = os.getenv("PROMETHEUS_URL", "http://localhost:9090") + self.prometheus_url = prometheus_url.rstrip('/') + self.timeout = timeout + self.api_url = f"{self.prometheus_url}/api/v1" + log.info(f"Initialized Prometheus client for {self.prometheus_url}") + + def check_health(self) -> bool: + """Check if Prometheus server is healthy and reachable.""" + try: + response = requests.get( + f"{self.prometheus_url}/-/healthy", + timeout=self.timeout + ) + if response.status_code == 200: + log.info(f"✓ Prometheus server at {self.prometheus_url} is healthy") + return True + else: + log.error(f"✗ Prometheus health check failed with status {response.status_code}") + return False + except Exception as e: + log.error(f"✗ Failed to connect to Prometheus at {self.prometheus_url}: {e}") + return False + + def query_instant(self, query: str) -> Dict[str, Any]: + """Execute an instant PromQL query.""" + try: + response = requests.get( + f"{self.api_url}/query", + params={'query': query}, + timeout=self.timeout + ) + response.raise_for_status() + result = response.json() + + if result.get('status') == 'success': + log.debug(f"Query successful: {query[:50]}...") + else: + log.warning(f"Query returned non-success status: {result.get('error', 'Unknown error')}") + + return result + except Exception as e: + log.error(f"Prometheus instant query failed: {e}") + return {"status": "error", "error": str(e)} + + def query_range(self, query: str, start_time: datetime, end_time: datetime, + step: str = "15s") -> Dict[str, Any]: + """Execute a range PromQL query for time-series data.""" + try: + response = requests.get( + f"{self.api_url}/query_range", + params={ + 'query': query, + 'start': start_time.timestamp(), + 'end': end_time.timestamp(), + 'step': step + }, + timeout=self.timeout + ) + response.raise_for_status() + result = response.json() + + if result.get('status') == 'success': + log.debug(f"Range query successful: {query[:50]}... 
[{start_time} to {end_time}]") + + return result + except Exception as e: + log.error(f"Prometheus range query failed: {e}") + return {"status": "error", "error": str(e)} + + def get_targets(self) -> List[Dict[str, Any]]: + """Get list of all scrape targets (Device Metrics Exporters) and their status.""" + try: + response = requests.get( + f"{self.api_url}/targets", + timeout=self.timeout + ) + response.raise_for_status() + data = response.json() + + if data.get('status') == 'success': + targets = data.get('data', {}).get('activeTargets', []) + log.info(f"Retrieved {len(targets)} active targets from Prometheus") + return targets + return [] + except Exception as e: + log.error(f"Failed to get Prometheus targets: {e}") + return [] + + +# Device Metrics Exporter metric names (as of v1.4.0) +DEVICE_METRICS_MAP = { + # Temperature metrics + 'temperature_edge': 'amdgpu_temperature_edge_celsius', + 'temperature_junction': 'amdgpu_temperature_junction_celsius', + 'temperature_memory': 'amdgpu_temperature_memory_celsius', + 'temperature_hbm': 'amdgpu_temperature_hbm_celsius', + + # Utilization metrics + 'gpu_utilization': 'amdgpu_gpu_utilization_percent', + 'memory_utilization': 'amdgpu_memory_utilization_percent', + + # Power metrics + 'power_current': 'amdgpu_power_watts', + 'power_average': 'amdgpu_power_average_watts', + 'energy_consumed': 'amdgpu_energy_joules', + + # Memory metrics + 'memory_used': 'amdgpu_memory_used_bytes', + 'memory_total': 'amdgpu_memory_total_bytes', + 'memory_free': 'amdgpu_memory_free_bytes', + + # Clock metrics + 'clock_gpu': 'amdgpu_gpu_clock_mhz', + 'clock_memory': 'amdgpu_memory_clock_mhz', + + # PCIe metrics + 'pcie_bandwidth': 'amdgpu_pcie_bandwidth_bytes', + 'pcie_link_speed': 'amdgpu_pcie_link_speed_mbps', + 'pcie_link_width': 'amdgpu_pcie_link_width', + 'pcie_replay_count': 'amdgpu_pcie_replay_count_total', + 'pcie_nak_sent': 'amdgpu_pcie_nak_sent_total', + 'pcie_nak_received': 'amdgpu_pcie_nak_received_total', + + # Error metrics + 'ecc_correctable': 'amdgpu_ecc_correctable_errors_total', + 'ecc_uncorrectable': 'amdgpu_ecc_uncorrectable_errors_total', + 'ras_correctable': 'amdgpu_ras_correctable_error_count', + 'ras_uncorrectable': 'amdgpu_ras_uncorrectable_error_count', +} + + +def get_gpu_metrics_from_prometheus(prom_client: PrometheusClient, + node_list: Optional[List[str]] = None, + metrics: Optional[List[str]] = None) -> Dict[str, Dict]: + """ + Query current GPU metrics from Prometheus for all or specific nodes. 
+ + Returns: + Dict with structure: {node: {gpu_id: {metric_name: value}}} + """ + metrics_dict = {} + + if metrics is None: + metrics = [ + 'temperature_edge', 'temperature_junction', 'temperature_memory', + 'power_current', 'power_average', + 'gpu_utilization', 'memory_utilization', + 'memory_used', 'memory_total', + 'pcie_bandwidth', 'pcie_link_speed', + 'ecc_correctable', 'ecc_uncorrectable', + 'clock_gpu', 'clock_memory' + ] + + for metric_key in metrics: + if metric_key not in DEVICE_METRICS_MAP: + log.warning(f"Unknown metric key: {metric_key}, skipping") + continue + + metric_name = DEVICE_METRICS_MAP[metric_key] + + # Build query with optional node filter + if node_list: + node_filter = '|'.join([node.replace('.', '\\.') for node in node_list]) + query = f'{metric_name}{{instance=~"({node_filter}):.*"}}' + else: + query = metric_name + + result = prom_client.query_instant(query) + + if result.get('status') == 'success': + for item in result.get('data', {}).get('result', []): + labels = item.get('metric', {}) + instance = labels.get('instance', '') + node = instance.split(':')[0] if ':' in instance else instance + gpu_id = labels.get('gpu', labels.get('gpu_id', 'unknown')) + value = item.get('value', [None, None])[1] + + try: + if value is not None: + value = float(value) + except (ValueError, TypeError): + pass + + if node not in metrics_dict: + metrics_dict[node] = {} + if gpu_id not in metrics_dict[node]: + metrics_dict[node][gpu_id] = {} + + metrics_dict[node][gpu_id][metric_key] = value + else: + log.warning(f"Failed to query metric {metric_key}: {result.get('error', 'Unknown error')}") + + log.info(f"Retrieved metrics for {len(metrics_dict)} nodes, {len(metrics)} metric types") + return metrics_dict + + +def get_device_exporter_health(prom_client: PrometheusClient, + node_list: Optional[List[str]] = None) -> Dict[str, Dict]: + """ + Check health status of Device Metrics Exporter on all nodes. 
+ """ + health_dict = {} + targets = prom_client.get_targets() + + for target in targets: + labels = target.get('labels', {}) + instance = labels.get('instance', '') + job = labels.get('job', '') + + if 'device-metrics' not in job.lower() and 'amd' not in job.lower(): + continue + + node = instance.split(':')[0] if ':' in instance else instance + + if node_list and node not in node_list: + continue + + health_dict[node] = { + 'health': target.get('health', 'unknown'), + 'last_scrape': target.get('lastScrape', ''), + 'scrape_duration': target.get('lastScrapeDuration', 0), + 'last_error': target.get('lastError', ''), + 'scrape_url': target.get('scrapeUrl', ''), + 'labels': labels + } + + up_count = sum(1 for h in health_dict.values() if h['health'] == 'up') + down_count = sum(1 for h in health_dict.values() if h['health'] == 'down') + log.info(f"Exporter health: {up_count} up, {down_count} down out of {len(health_dict)} nodes") + + return health_dict + + +def create_grafana_annotation(grafana_url: str, api_key: str, + text: str, tags: List[str], + start_time: Optional[datetime] = None, + end_time: Optional[datetime] = None) -> bool: + """Create an annotation in Grafana to mark CVS test events.""" + try: + url = f"{grafana_url.rstrip('/')}/api/annotations" + headers = { + 'Authorization': f'Bearer {api_key}', + 'Content-Type': 'application/json' + } + + if start_time is None: + start_time = datetime.now() + + data = { + 'text': text, + 'tags': tags, + 'time': int(start_time.timestamp() * 1000) + } + + if end_time: + data['timeEnd'] = int(end_time.timestamp() * 1000) + + response = requests.post(url, headers=headers, json=data, timeout=10) + response.raise_for_status() + + log.info(f"✓ Created Grafana annotation: {text}") + return True + + except Exception as e: + log.error(f"✗ Failed to create Grafana annotation: {e}") + return False + + +def compare_ssh_vs_prometheus(ssh_metrics: Dict, prom_metrics: Dict, + tolerance: float = 5.0) -> Dict: + """Compare metrics collected via SSH vs Prometheus to validate consistency.""" + comparison = { + 'summary': { + 'total_nodes': 0, + 'matching_nodes': 0, + 'discrepancy_nodes': 0, + 'ssh_only_nodes': 0, + 'prom_only_nodes': 0 + }, + 'node_comparisons': [], + 'discrepancies': [] + } + + ssh_nodes = set(ssh_metrics.keys()) + prom_nodes = set(prom_metrics.keys()) + + comparison['summary']['total_nodes'] = len(ssh_nodes | prom_nodes) + comparison['summary']['ssh_only_nodes'] = len(ssh_nodes - prom_nodes) + comparison['summary']['prom_only_nodes'] = len(prom_nodes - ssh_nodes) + + for node in (ssh_nodes - prom_nodes): + log.warning(f"Node {node} only in SSH metrics (not in Prometheus)") + comparison['node_comparisons'].append({ + 'node': node, + 'status': 'ssh_only', + 'gpu_count_match': False + }) + + for node in (prom_nodes - ssh_nodes): + log.warning(f"Node {node} only in Prometheus metrics (not in SSH)") + comparison['node_comparisons'].append({ + 'node': node, + 'status': 'prom_only', + 'gpu_count_match': False + }) + + common_nodes = ssh_nodes & prom_nodes + + for node in common_nodes: + node_comparison = { + 'node': node, + 'status': 'match', + 'gpu_count_match': True, + 'metric_comparisons': [] + } + + ssh_gpus = set(ssh_metrics[node].keys()) + prom_gpus = set(prom_metrics[node].keys()) + + if ssh_gpus != prom_gpus: + node_comparison['gpu_count_match'] = False + node_comparison['status'] = 'discrepancy' + log.warning(f"Node {node}: GPU count mismatch") + + common_gpus = ssh_gpus & prom_gpus + for gpu_id in common_gpus: + ssh_gpu = 
ssh_metrics[node][gpu_id] + prom_gpu = prom_metrics[node][gpu_id] + + ssh_metric_keys = set(ssh_gpu.keys()) + prom_metric_keys = set(prom_gpu.keys()) + common_metrics = ssh_metric_keys & prom_metric_keys + + for metric_key in common_metrics: + ssh_val = ssh_gpu[metric_key] + prom_val = prom_gpu[metric_key] + + if ssh_val is None or prom_val is None: + continue + + try: + ssh_num = float(ssh_val) + prom_num = float(prom_val) + + if ssh_num != 0: + diff_percent = abs((prom_num - ssh_num) / ssh_num) * 100 + else: + diff_percent = 0 if prom_num == 0 else 100 + + if diff_percent > tolerance: + node_comparison['status'] = 'discrepancy' + comparison['discrepancies'].append({ + 'node': node, + 'gpu': str(gpu_id), + 'metric': metric_key, + 'ssh_value': ssh_num, + 'prom_value': prom_num, + 'diff_percent': round(diff_percent, 2) + }) + except (ValueError, TypeError): + if str(ssh_val) != str(prom_val): + node_comparison['status'] = 'discrepancy' + comparison['discrepancies'].append({ + 'node': node, + 'gpu': str(gpu_id), + 'metric': metric_key, + 'ssh_value': str(ssh_val), + 'prom_value': str(prom_val), + 'diff_percent': None + }) + + comparison['node_comparisons'].append(node_comparison) + + if node_comparison['status'] == 'match': + comparison['summary']['matching_nodes'] += 1 + else: + comparison['summary']['discrepancy_nodes'] += 1 + + log.info(f"Comparison complete: {comparison['summary']['matching_nodes']}/{len(common_nodes)} nodes match") + if comparison['discrepancies']: + log.warning(f"Found {len(comparison['discrepancies'])} metric discrepancies") + + return comparison diff --git a/lib/utils_lib.py b/lib/utils_lib.py index 7ed16ed7..cc69bda4 100644 --- a/lib/utils_lib.py +++ b/lib/utils_lib.py @@ -418,3 +418,78 @@ def resolve_test_config_placeholders(config_dict, cluster_dict): resolved_config = _resolve_placeholders_in_dict(config_dict, replacements, context_name="test config") return resolved_config + + +def resolve_placeholder_with_fallback(value, fallback): + """ + Resolve placeholder strings, returning fallback if unresolved. + + Args: + value: Value that may contain unresolved placeholders like {prometheus-host} + fallback: Default value to use if placeholder is unresolved + + Returns: + Resolved value or fallback if value is None/empty/unresolved placeholder + + Examples: + >>> resolve_placeholder_with_fallback("{prometheus-host}", "localhost") + 'localhost' + >>> resolve_placeholder_with_fallback("10.0.0.5", "localhost") + '10.0.0.5' + >>> resolve_placeholder_with_fallback(None, "localhost") + 'localhost' + """ + if value is None: + return fallback + + # Convert to string + value_str = str(value).strip() + + # Empty or unresolved placeholder (starts with { and ends with }) + if not value_str or (value_str.startswith("{") and value_str.endswith("}")): + return fallback + + return value_str + + +def apply_monitoring_defaults(config_dict): + """ + Apply default fallback values for monitoring configuration. + Ensures localhost/default ports/versions when placeholders aren't resolved. 
+ + Args: + config_dict: Monitoring configuration dictionary + + Returns: + dict: Configuration with defaults applied + """ + defaults = { + 'prometheus_host': 'localhost', + 'prometheus_port': 9090, + 'prometheus_version': 'v2.55.0', + 'grafana_host': 'localhost', + 'grafana_port': 3000, + 'grafana_version': '10.4.1', + 'device_metrics_exporter_version': 'v1.4.0', + 'device_metrics_exporter_port': 5000, + 'device_metrics_exporter_host': 'localhost', + } + + result = config_dict.copy() + + for key, default_value in defaults.items(): + current_value = result.get(key) + result[key] = resolve_placeholder_with_fallback(current_value, default_value) + + # Build derived URLs with resolved values + if 'prometheus_url' in result: + prom_host = result['prometheus_host'] + prom_port = result['prometheus_port'] + result['prometheus_url'] = f"http://{prom_host}:{prom_port}" + + if 'grafana_url' in result: + graf_host = result['grafana_host'] + graf_port = result['grafana_port'] + result['grafana_url'] = f"http://{graf_host}:{graf_port}" + + return result diff --git a/monitoring/prometheus/alert_rules.yml b/monitoring/prometheus/alert_rules.yml new file mode 100644 index 00000000..723424d6 --- /dev/null +++ b/monitoring/prometheus/alert_rules.yml @@ -0,0 +1,74 @@ +# Prometheus Alert Rules for AMD GPU Health Monitoring + +groups: + - name: gpu_health_alerts + interval: 30s + rules: + # GPU Temperature Alerts + - alert: GPUTemperatureWarning + expr: amdgpu_temperature_celsius{sensor="edge"} > 95 + for: 2m + labels: + severity: warning + annotations: + summary: "GPU temperature high on {{ $labels.node }}" + description: "GPU {{ $labels.gpu }} on node {{ $labels.node }} has temperature {{ $value }}°C (threshold: 95°C)" + + - alert: GPUTemperatureCritical + expr: amdgpu_temperature_celsius{sensor="edge"} > 105 + for: 1m + labels: + severity: critical + annotations: + summary: "GPU temperature critical on {{ $labels.node }}" + description: "GPU {{ $labels.gpu }} on node {{ $labels.node }} has temperature {{ $value }}°C (threshold: 105°C)" + + # GPU Power Alerts + - alert: GPUPowerHigh + expr: amdgpu_power_watts > 700 + for: 5m + labels: + severity: warning + annotations: + summary: "GPU power consumption high on {{ $labels.node }}" + description: "GPU {{ $labels.gpu }} on node {{ $labels.node }} consuming {{ $value }}W (threshold: 700W)" + + # ECC Error Alerts + - alert: GPUECCErrors + expr: rate(amdgpu_ecc_errors_total[5m]) > 0 + for: 1m + labels: + severity: warning + annotations: + summary: "GPU ECC errors detected on {{ $labels.node }}" + description: "GPU {{ $labels.gpu }} on node {{ $labels.node }} reporting ECC errors" + + # PCIe Replay Errors + - alert: PCIeReplayErrors + expr: rate(amdgpu_pcie_replay_count[5m]) > 10 + for: 2m + labels: + severity: warning + annotations: + summary: "PCIe replay errors on {{ $labels.node }}" + description: "GPU {{ $labels.gpu }} on node {{ $labels.node }} experiencing PCIe replay errors" + + # Exporter Health + - alert: DeviceMetricsExporterDown + expr: up{job="device-metrics-exporter"} == 0 + for: 2m + labels: + severity: critical + annotations: + summary: "Device Metrics Exporter down on {{ $labels.node }}" + description: "Cannot scrape metrics from {{ $labels.node }} - exporter may be down" + + # Cluster-wide alerts + - alert: MultipleGPUsOverheating + expr: count(amdgpu_temperature_celsius{sensor="edge"} > 95) > 3 + for: 5m + labels: + severity: critical + annotations: + summary: "Multiple GPUs overheating in cluster" + description: "{{ $value }} GPUs are above 
95°C - possible cooling issue" diff --git a/monitoring/prometheus/prometheus.yml b/monitoring/prometheus/prometheus.yml new file mode 100644 index 00000000..33899dc7 --- /dev/null +++ b/monitoring/prometheus/prometheus.yml @@ -0,0 +1,54 @@ +# Prometheus Configuration for CVS Device Metrics Monitoring +# This file configures Prometheus to scrape AMD GPU metrics from Device Metrics Exporter + +global: + scrape_interval: 15s # How often to scrape targets + evaluation_interval: 15s # How often to evaluate rules + scrape_timeout: 10s # Timeout for scraping + external_labels: + cluster: 'cvs-cluster' + monitor: 'gpu-monitoring' + +# Load alert rules +rule_files: + - 'alert_rules.yml' + +# Alertmanager configuration (optional) +# alerting: +# alertmanagers: +# - static_configs: +# - targets: +# - 'localhost:9093' + +# Scrape configurations +scrape_configs: + # Job for AMD Device Metrics Exporter running on all GPU nodes + - job_name: 'device-metrics-exporter' + static_configs: + - targets: + # ===== UPDATE THESE WITH YOUR ACTUAL NODE HOSTNAMES/IPs ===== + #- 'node1:5000' + #- 'node2:5000' + # Add more nodes as needed + # For local testing use: - 'localhost:5000' + - 'localhost:5000' + labels: + cluster: 'cvs-cluster' + + # Relabel to extract node name from target + relabel_configs: + - source_labels: [__address__] + regex: '([^:]+):.*' + target_label: node + replacement: '$1' + + # Metric relabeling (optional filtering) + metric_relabel_configs: + - source_labels: [__name__] + regex: 'amdgpu_.*' + action: keep + + # Prometheus self-monitoring + - job_name: 'prometheus' + static_configs: + - targets: ['localhost:9090'] diff --git a/tests/monitoring/install_device_metrics_exporter.py b/tests/monitoring/install_device_metrics_exporter.py new file mode 100644 index 00000000..eb490fd0 --- /dev/null +++ b/tests/monitoring/install_device_metrics_exporter.py @@ -0,0 +1,261 @@ +# Drop-in replacement for tests/monitoring/install_device_metrics_exporter.py +# Key changes: +# 1. Added apply_monitoring_defaults to config_dict fixture +# 2. Updated metrics_host fixture to use resolved device_metrics_exporter_host +# 3. 
Fixed hardcoded localhost in test_check_gpu_metrics_exposed (line ~217) + +import pytest +import re +import sys +import os +import time +import json +import logging + +sys.path.insert(0, './lib') +from parallel_ssh_lib import * +from utils_lib import * + +import globals + +log = globals.log + + +@pytest.fixture(scope="module") +def cluster_file(pytestconfig): + """Get cluster file path from pytest CLI""" + return pytestconfig.getoption("cluster_file") + + +@pytest.fixture(scope="module") +def config_file(pytestconfig): + """Get config file path from pytest CLI""" + return pytestconfig.getoption("config_file") + + +@pytest.fixture(scope="module") +def cluster_dict(cluster_file): + """Load cluster configuration""" + with open(cluster_file) as json_file: + cluster_dict = json.load(json_file) + cluster_dict = resolve_cluster_config_placeholders(cluster_dict) + log.info(cluster_dict) + return cluster_dict + + +@pytest.fixture(scope="module") +def config_dict(config_file, cluster_dict): + """Load monitoring configuration with localhost/version fallbacks""" + with open(config_file) as json_file: + config_dict_t = json.load(json_file) + config_dict = config_dict_t.get('monitoring', {}) + config_dict = resolve_test_config_placeholders(config_dict, cluster_dict) + # Apply defaults for unresolved placeholders + config_dict = apply_monitoring_defaults(config_dict) + log.info("Resolved monitoring config:") + log.info(config_dict) + return config_dict + + +@pytest.fixture(scope="module") +def metrics_host(config_dict): + """Get metrics host with fallback to localhost""" + return config_dict.get("device_metrics_exporter_host", "localhost") + + +@pytest.fixture(scope="module") +def phdl(cluster_dict): + """Create parallel SSH handle for all nodes""" + node_list = list(cluster_dict['node_dict'].keys()) + phdl = Pssh(log, node_list, user=cluster_dict['username'], pkey=cluster_dict['priv_key_file']) + return phdl + + +def test_check_docker_installed(phdl): + """Verify Docker is installed on all nodes""" + globals.error_list = [] + log.info("Checking if Docker is installed on all nodes") + + out_dict = phdl.exec('docker --version') + + for node in out_dict.keys(): + if not re.search(r'Docker version', out_dict[node], re.I): + fail_test(f"Docker is not installed on node {node}. Please install Docker first.") + + update_test_result() + + +def test_check_rocm_installed(phdl): + """Verify ROCm is installed on all nodes""" + globals.error_list = [] + log.info("Checking if ROCm is installed on all nodes") + + out_dict = phdl.exec('rocm-smi --version || amd-smi version') + + for node in out_dict.keys(): + if not re.search(r'ROCm|AMD', out_dict[node], re.I): + fail_test(f"ROCm is not installed on node {node}. 
Please install ROCm first.") + + update_test_result() + + +def test_pull_device_metrics_exporter_image(phdl, config_dict): + """Pull Device Metrics Exporter Docker image on all nodes""" + globals.error_list = [] + log.info("Pulling Device Metrics Exporter Docker image on all nodes") + + version = config_dict['device_metrics_exporter_version'] + image = f"rocm/device-metrics-exporter:{version}" + log.info(f"Using image: {image}") + + out_dict = phdl.exec(f'docker pull {image}', timeout=300) + + for node in out_dict.keys(): + if 'Error' in out_dict[node] or 'failed' in out_dict[node].lower(): + fail_test(f"Failed to pull Docker image on node {node}: {out_dict[node]}") + + update_test_result() + + +def test_stop_existing_device_metrics_exporter(phdl): + """Stop and remove any existing Device Metrics Exporter containers""" + globals.error_list = [] + log.info("Stopping existing Device Metrics Exporter containers (if any)") + + phdl.exec('docker stop device-metrics-exporter 2>/dev/null || true') + phdl.exec('docker rm device-metrics-exporter 2>/dev/null || true') + + log.info("Cleaned up existing containers") + update_test_result() + + +def test_start_device_metrics_exporter(phdl, config_dict): + """Start Device Metrics Exporter container on all nodes""" + globals.error_list = [] + log.info("Starting Device Metrics Exporter on all nodes") + + version = config_dict['device_metrics_exporter_version'] + port = config_dict['device_metrics_exporter_port'] + + log.info(f"Starting exporter version {version} on port {port}") + + # Docker run command + docker_cmd = f'''docker run -d \ + --device=/dev/dri \ + --device=/dev/kfd \ + --network=host \ + -p {port}:{port} \ + --restart unless-stopped \ + --name device-metrics-exporter \ + rocm/device-metrics-exporter:{version}''' + + out_dict = phdl.exec(docker_cmd) + + for node in out_dict.keys(): + if 'Error' in out_dict[node]: + fail_test(f"Failed to start Device Metrics Exporter on node {node}: {out_dict[node]}") + + log.info("Device Metrics Exporter started on all nodes") + update_test_result() + + +def test_verify_exporter_running(phdl): + """Verify Device Metrics Exporter is running""" + globals.error_list = [] + log.info("Verifying Device Metrics Exporter is running on all nodes") + + # Wait for containers to start + time.sleep(10) + + out_dict = phdl.exec('docker ps --filter name=device-metrics-exporter --format "{{.Status}}"') + + for node in out_dict.keys(): + if 'Up' not in out_dict[node]: + fail_test(f"Device Metrics Exporter is not running on node {node}") + + update_test_result() + + +def test_verify_metrics_endpoint(phdl, config_dict, metrics_host): + """Verify metrics endpoint is accessible""" + globals.error_list = [] + log.info("Verifying metrics endpoint is accessible on all nodes") + + port = config_dict['device_metrics_exporter_port'] + log.info(f"Testing endpoint: http://{metrics_host}:{port}/metrics") + + # Retry logic for slow container startup + max_retries = 3 + out_dict = None + + for attempt in range(max_retries): + out_dict = phdl.exec(f'curl -s http://{metrics_host}:{port}/metrics | head -20') + + # Check if we got output + has_output = False + for node in out_dict.keys(): + if len(out_dict[node]) > 0: + has_output = True + break + + if has_output: + break + else: + log.info(f"Attempt {attempt+1}/{max_retries}: No output yet, waiting 5 seconds...") + time.sleep(5) + + # Final validation + for node in out_dict.keys(): + output = out_dict[node] + log.info(f"Checking output from {node}, length: {len(output)}") + + if output and 
'gpu_' in output.lower(): + log.info(f"Metrics endpoint verified on node {node}") + else: + log.error(f"Output sample: {output[:200]}") + fail_test(f"Metrics endpoint not accessible on node {node}") + + update_test_result() + + +def test_check_gpu_metrics_exposed(phdl, config_dict, metrics_host): + """Verify GPU metrics are being exposed""" + globals.error_list = [] + log.info("Checking if GPU metrics are being exposed") + + port = config_dict['device_metrics_exporter_port'] + + # Use metrics_host instead of hardcoded localhost + out_dict = phdl.exec(f'curl -s http://{metrics_host}:{port}/metrics | head -50') + + for node in out_dict.keys(): + output = out_dict[node] + log.info(f"Checking GPU metrics from {node}, length: {len(output)}") + + if output.strip() and 'gpu_' in output.lower(): + log.info(f"GPU metrics verified on node {node}") + # Show sample + lines = [line for line in output.split('\n') if 'gpu_' in line.lower()][:2] + for line in lines: + log.info(f" Sample: {line[:80]}") + else: + log.error(f"No GPU metrics found. Output: {output[:300]}") + fail_test(f"GPU metrics not found on node {node}") + + update_test_result() + + +def test_display_summary(phdl): + """Display installation summary""" + log.info("=" * 80) + log.info("Device Metrics Exporter Installation Complete!") + log.info("=" * 80) + log.info("") + log.info("Exporter Status:") + + out_dict = phdl.exec('docker ps --filter name=device-metrics-exporter --format "{{.Names}}: {{.Status}}"') + + for node in out_dict.keys(): + log.info(f" {node}: {out_dict[node]}") + + log.info("Completed metrics tests successfully.") diff --git a/utils/deploy_monitoring_stack.sh b/utils/deploy_monitoring_stack.sh new file mode 100755 index 00000000..0c06278c --- /dev/null +++ b/utils/deploy_monitoring_stack.sh @@ -0,0 +1,229 @@ +#!/bin/bash + +# CVS Monitoring Stack Deployment Script +# Deploys Device Metrics Exporter + Prometheus + Grafana + +set -e + +# Get script directory and repo root +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(cd "$SCRIPT_DIR/.." 
&& pwd)" + +cd "$REPO_ROOT" + +# Configuration with localhost fallback +CLUSTER_FILE="${1:-./input/cluster_file/local_test_cluster.json}" +MONITORING_CONFIG="${2:-./input/config_file/monitoring/monitoring_config.json}" + +# Helper function to resolve placeholders +resolve_with_fallback() { + local value="$1" + local fallback="$2" + + # If value is empty or contains unresolved placeholder pattern {...} + if [[ -z "$value" ]] || [[ "$value" =~ ^\{.*\}$ ]]; then + echo "$fallback" + else + # Remove 'v' prefix if exists for version numbers + echo "${value#v}" + fi +} + +# Read versions from config with fallback +if [ -f "$MONITORING_CONFIG" ] && command -v jq &> /dev/null; then + PROM_RAW=$(jq -r '.monitoring.prometheus_version // "v2.55.0"' "$MONITORING_CONFIG") + GRAF_RAW=$(jq -r '.monitoring.grafana_version // "10.4.1"' "$MONITORING_CONFIG") + EXPO_RAW=$(jq -r '.monitoring.device_metrics_exporter_version // "v1.4.0"' "$MONITORING_CONFIG") + + PROMETHEUS_VERSION=$(resolve_with_fallback "$PROM_RAW" "2.55.0") + GRAFANA_VERSION=$(resolve_with_fallback "$GRAF_RAW" "10.4.1") + DEVICE_METRICS_VERSION=$(resolve_with_fallback "$EXPO_RAW" "v1.4.0") +else + # Fallback defaults + PROMETHEUS_VERSION="2.55.0" + GRAFANA_VERSION="10.4.1" + DEVICE_METRICS_VERSION="v1.4.0" +fi + +echo "============================================" +echo "CVS Monitoring Stack Deployment" +echo "============================================" +echo "" +echo "Working Directory: $REPO_ROOT" +echo "Cluster File: $CLUSTER_FILE" +echo "Monitoring Config: $MONITORING_CONFIG" +echo "Prometheus Version: $PROMETHEUS_VERSION" +echo "Grafana Version: $GRAFANA_VERSION" +echo "Exporter Version: $DEVICE_METRICS_VERSION" +echo "" + +# Step 1: Deploy Device Metrics Exporter on all GPU nodes using pytest +echo "Step 1: Deploying Device Metrics Exporter on all GPU nodes..." +echo "------------------------------------------------------------" +pytest -vv -s ./tests/monitoring/install_device_metrics_exporter.py \ + --cluster_file "$CLUSTER_FILE" \ + --config_file "$MONITORING_CONFIG" \ + --html=/tmp/device_metrics_install_report.html \ + --capture=tee-sys \ + --self-contained-html + +if [ $? -ne 0 ]; then + echo "ERROR: Device Metrics Exporter installation failed!" + exit 1 +fi + +echo "" +echo "- Device Metrics Exporter deployed successfully!" +echo "" + +# Step 2: Setup Prometheus on management node +echo "Step 2: Setting up Prometheus..." +echo "------------------------------------------------------------" + +if ! command -v prometheus &> /dev/null; then + echo "Prometheus not found. Installing..." + + cd /tmp + echo "Downloading Prometheus ${PROMETHEUS_VERSION} (~92MB)..." + wget --progress=bar:force https://github.com/prometheus/prometheus/releases/download/v${PROMETHEUS_VERSION}/prometheus-${PROMETHEUS_VERSION}.linux-amd64.tar.gz + echo "Download complete. Extracting..." 
+ tar xzf prometheus-${PROMETHEUS_VERSION}.linux-amd64.tar.gz + + sudo mkdir -p /opt/prometheus + sudo cp -r prometheus-${PROMETHEUS_VERSION}.linux-amd64/* /opt/prometheus/ + sudo mkdir -p /var/lib/prometheus/data + + cd "$REPO_ROOT" + + # Copy config from repo + if [ -f "./monitoring/prometheus/prometheus.yml" ]; then + sudo cp ./monitoring/prometheus/prometheus.yml /opt/prometheus/ + echo "- Copied prometheus.yml" + else + echo "ERROR: prometheus.yml not found at ./monitoring/prometheus/prometheus.yml" + exit 1 + fi + + if [ -f "./monitoring/prometheus/alert_rules.yml" ]; then + sudo cp ./monitoring/prometheus/alert_rules.yml /opt/prometheus/ + echo "- Copied alert_rules.yml" + else + echo "WARNING: alert_rules.yml not found" + fi + + # Create systemd service + sudo tee /etc/systemd/system/prometheus.service > /dev/null </dev/null || true + docker rm grafana 2>/dev/null || true + + docker run -d \ + -p 3000:3000 \ + --name grafana \ + --restart unless-stopped \ + -v grafana-storage:/var/lib/grafana \ + grafana/grafana:${GRAFANA_VERSION} + + echo " Grafana installed and started" + echo " Default credentials: admin/admin" +else + echo " Grafana container already exists" + if ! docker ps --format '{{.Names}}' | grep -q '^grafana$'; then + echo " Starting Grafana..." + docker start grafana + fi +fi + +# Step 4: Verify everything is running +echo "" +echo "Step 4: Verifying installation..." +echo "------------------------------------------------------------" + +# Wait a bit for services to be ready +sleep 3 + +# Check Prometheus +if curl -s http://localhost:9090/-/healthy > /dev/null 2>&1; then + echo " Prometheus is healthy" +else + echo " Prometheus health check failed" +fi + +# Check Grafana +if curl -s http://localhost:3000/api/health > /dev/null 2>&1; then + echo " Grafana is healthy" +else + echo " Grafana health check failed (may still be starting...)" +fi + +# Check Device Metrics Exporter +if curl -s http://localhost:5000/metrics | head -1 > /dev/null 2>&1; then + echo " Device Metrics Exporter is responding" +else + echo " Device Metrics Exporter check failed" +fi + +# Check targets if jq available +if command -v jq &> /dev/null; then + echo "" + echo "Prometheus Targets:" + curl -s http://localhost:9090/api/v1/targets 2>/dev/null | \ + jq -r '.data.activeTargets[]? | "\(.labels.instance): \(.health)"' 2>/dev/null || \ + echo " (Could not retrieve targets)" +fi + +echo "" +echo "============================================" +echo "Deployment Complete!" +echo "============================================" +echo "" +echo "Access URLs:" +echo " Prometheus: http://localhost:9090" +echo " Grafana: http://localhost:3000" +echo " Exporter: http://localhost:5000/metrics" +echo "" +echo "Next Steps:" +echo " 1. Log into Grafana (admin/admin)" +echo " 2. Add Prometheus as datasource: http://localhost:9090" +echo " 3. Import dashboards from monitoring/grafana/dashboards/ (if available)" +echo " 4. 
Run CVS tests with --prometheus-url=http://localhost:9090" +echo "" From 7b4aa82da096415dd703e112927791ed81a0ee17 Mon Sep 17 00:00:00 2001 From: Manoj S K Date: Wed, 19 Nov 2025 08:48:30 -0800 Subject: [PATCH 2/7] using amdgpu filters everything out, metrics have gpu_* names fixing config for prometheus to scrappe proper ones --- monitoring/prometheus/prometheus.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/monitoring/prometheus/prometheus.yml b/monitoring/prometheus/prometheus.yml index 33899dc7..abc87370 100644 --- a/monitoring/prometheus/prometheus.yml +++ b/monitoring/prometheus/prometheus.yml @@ -45,7 +45,7 @@ scrape_configs: # Metric relabeling (optional filtering) metric_relabel_configs: - source_labels: [__name__] - regex: 'amdgpu_.*' + regex: 'gpu_.*' action: keep # Prometheus self-monitoring From f54747a0ad8e7f490bc1b071ed0b795fda612c20 Mon Sep 17 00:00:00 2001 From: Manoj S K Date: Thu, 20 Nov 2025 02:49:45 -0800 Subject: [PATCH 3/7] remove new CLI opts and cleanup --- conftest.py | 4 --- input/cluster_file/dummy_monitor_cluster.json | 26 ------------------- input/cluster_file/local_test_cluster.json | 18 ------------- .../cluster_file/sample_monitor_cluster.json | 13 ++++++++++ 4 files changed, 13 insertions(+), 48 deletions(-) delete mode 100644 input/cluster_file/dummy_monitor_cluster.json delete mode 100644 input/cluster_file/local_test_cluster.json create mode 100644 input/cluster_file/sample_monitor_cluster.json diff --git a/conftest.py b/conftest.py index 58ab331b..560404e5 100644 --- a/conftest.py +++ b/conftest.py @@ -14,10 +14,6 @@ def pytest_addoption(parser): help="Path to the cluster JSON file") parser.addoption("--config_file", action="store", default=None, help="Path to the config JSON file") - parser.addoption("--prometheus-url", action="store", default=None, - help="Prometheus server URL (optional)") - parser.addoption("--grafana-url", action="store", default=None, - help="Grafana server URL (optional)") def pytest_metadata(metadata): """Add CVS version metadata for both console output and HTML report.""" diff --git a/input/cluster_file/dummy_monitor_cluster.json b/input/cluster_file/dummy_monitor_cluster.json deleted file mode 100644 index 6ac97ad3..00000000 --- a/input/cluster_file/dummy_monitor_cluster.json +++ /dev/null @@ -1,26 +0,0 @@ -{ - "username": "all-os-star", - "priv_key_file": "/home/all-os-star/.ssh/id_rsa", - - "head_node_dict": { - "mgmt_ip": "10.0.1.100" - }, - - "node_dict": { - "10.0.1.10": { - "hostname": "gpu-node-1", - "vpc_ip": "10.0.1.10", - "bmc_ip": "10.0.2.10" - }, - "10.0.1.11": { - "hostname": "gpu-node-2", - "vpc_ip": "10.0.1.11", - "bmc_ip": "10.0.2.11" - }, - "10.0.1.12": { - "hostname": "gpu-node-3", - "vpc_ip": "10.0.1.12", - "bmc_ip": "10.0.2.12" - } - } -} diff --git a/input/cluster_file/local_test_cluster.json b/input/cluster_file/local_test_cluster.json deleted file mode 100644 index 3ad2f3ac..00000000 --- a/input/cluster_file/local_test_cluster.json +++ /dev/null @@ -1,18 +0,0 @@ -{ - "_comment": "Local test cluster configuration for single-machine testing", - "username": "{user-id}", - "priv_key_file": "/home/{user-id}/.ssh/id_rsa", - - "head_node_dict": { - "mgmt_ip": "localhost" - }, - - "node_dict": { - "localhost": { - "ip_addr": "127.0.0.1", - "hostname": "localhost", - "vpc_ip": "localhost", - "bmc_ip": "NA" - } - } -} diff --git a/input/cluster_file/sample_monitor_cluster.json b/input/cluster_file/sample_monitor_cluster.json new file mode 100644 index 00000000..27869619 --- /dev/null +++ 
b/input/cluster_file/sample_monitor_cluster.json @@ -0,0 +1,13 @@ +{ + "username": "svdt-8", + "priv_key_file": "/home/svdt-8/.ssh/id_rsa", + "head_node_dict": { + "mgmt_ip": "localhost" + }, + "node_dict": { + "localhost": { + "bmc_ip": "NA", + "vpc_ip": "localhost" + } + } +} From aaac29163eba1edf56ab36f37527b2846971c652 Mon Sep 17 00:00:00 2001 From: Manoj S K Date: Thu, 20 Nov 2025 03:57:32 -0800 Subject: [PATCH 4/7] stop prometheus before checking and running the flow --- utils/deploy_monitoring_stack.sh | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/utils/deploy_monitoring_stack.sh b/utils/deploy_monitoring_stack.sh index 0c06278c..0ece3e94 100755 --- a/utils/deploy_monitoring_stack.sh +++ b/utils/deploy_monitoring_stack.sh @@ -79,6 +79,15 @@ echo "" # Step 2: Setup Prometheus on management node echo "Step 2: Setting up Prometheus..." echo "------------------------------------------------------------" +# Stop existing Prometheus if running +if systemctl is-active --quiet prometheus 2>/dev/null; then + echo "Stopping existing Prometheus service..." + sudo systemctl stop prometheus + sleep 2 +fi + +sudo pkill -9 prometheus 2>/dev/null || true +sleep 2 if ! command -v prometheus &> /dev/null; then echo "Prometheus not found. Installing..." From 4a6ae0e352b2f0350bdce0b7eae9668eb17f20ac Mon Sep 17 00:00:00 2001 From: Manoj S K Date: Thu, 20 Nov 2025 04:09:22 -0800 Subject: [PATCH 5/7] keep opts as in the original, no changes here --- conftest.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/conftest.py b/conftest.py index 560404e5..4c30bcfc 100644 --- a/conftest.py +++ b/conftest.py @@ -10,13 +10,12 @@ # Add all additional cmd line arguments for the script def pytest_addoption(parser): - parser.addoption("--cluster_file", action="store", default=None, - help="Path to the cluster JSON file") - parser.addoption("--config_file", action="store", default=None, - help="Path to the config JSON file") - + parser.addoption( "--cluster_file", action="store", required=True, help="Input file with all the details of the cluster, nodes, switches in JSON format" ) + parser.addoption( "--config_file", action="store", required=True, help="Input file with all configurations and parameters for tests in JSON format" ) + def pytest_metadata(metadata): """Add CVS version metadata for both console output and HTML report.""" + # Read CVS version from version.txt cvs_version = "Unknown" version_file = os.path.join(os.path.dirname(__file__), "version.txt") From d113c5dbe5885791b5030c02481b928efe65bf01 Mon Sep 17 00:00:00 2001 From: Manoj S K Date: Thu, 20 Nov 2025 23:07:28 -0800 Subject: [PATCH 6/7] check mgmt and install servers only there --- lib/prometheus_config_lib.py | 107 ++++++ lib/utils_lib.py | 135 +++++++ .../install_device_metrics_exporter.py | 351 ++++++++++++++++++ 3 files changed, 593 insertions(+) create mode 100644 lib/prometheus_config_lib.py diff --git a/lib/prometheus_config_lib.py b/lib/prometheus_config_lib.py new file mode 100644 index 00000000..da6934f6 --- /dev/null +++ b/lib/prometheus_config_lib.py @@ -0,0 +1,107 @@ +''' +Copyright 2025 Advanced Micro Devices, Inc. +Prometheus Configuration Generator for CVS Monitoring +''' + +import json +import yaml +import logging + +log = logging.getLogger(__name__) + + +def generate_prometheus_config(cluster_dict, config_dict, output_file=None): + """ + Generate Prometheus configuration with dynamic scrape targets. 
+ + Args: + cluster_dict: Cluster configuration + config_dict: Monitoring configuration + output_file: Optional output file path + + Returns: + str: YAML configuration content + """ + from utils_lib import generate_prometheus_targets + + # Get configuration values + scrape_interval = config_dict.get('scrape_interval', '15s') + scrape_timeout = config_dict.get('scrape_timeout', '10s') + retention_days = config_dict.get('retention_days', 30) + exporter_port = config_dict.get('device_metrics_exporter_port', 5000) + + # Generate targets for all nodes (management + workers) + targets = generate_prometheus_targets(cluster_dict, exporter_port) + + log.info(f"Generating Prometheus config for {len(targets)} targets") + for target in targets: + log.info(f" • {target}") + + # Build Prometheus configuration + config = { + 'global': { + 'scrape_interval': scrape_interval, + 'scrape_timeout': scrape_timeout, + 'evaluation_interval': scrape_interval + }, + 'scrape_configs': [ + { + 'job_name': 'device-metrics-exporter', + 'static_configs': [ + { + 'targets': targets + } + ], + 'metric_relabel_configs': [ + { + 'source_labels': ['__name__'], + 'regex': 'gpu_.*', + 'action': 'keep' + } + ] + } + ] + } + + # Convert to YAML + yaml_content = yaml.dump(config, default_flow_style=False, sort_keys=False) + + # Write to file if specified + if output_file: + with open(output_file, 'w') as f: + f.write(yaml_content) + log.info(f"Prometheus config written to: {output_file}") + + return yaml_content + + +def update_prometheus_targets(prometheus_yml_path, cluster_dict, exporter_port=5000): + """ + Update existing Prometheus config with new targets. + + Args: + prometheus_yml_path: Path to prometheus.yml + cluster_dict: Cluster configuration + exporter_port: Exporter port (default: 5000) + """ + from utils_lib import generate_prometheus_targets + + # Load existing config + with open(prometheus_yml_path, 'r') as f: + config = yaml.safe_load(f) + + # Generate new targets + targets = generate_prometheus_targets(cluster_dict, exporter_port) + + # Update targets in scrape config + for scrape_config in config.get('scrape_configs', []): + if scrape_config.get('job_name') == 'device-metrics-exporter': + scrape_config['static_configs'] = [{'targets': targets}] + log.info(f"Updated scrape targets: {targets}") + break + + # Write back + with open(prometheus_yml_path, 'w') as f: + yaml.dump(config, f, default_flow_style=False, sort_keys=False) + + log.info(f"Prometheus config updated: {prometheus_yml_path}") diff --git a/lib/utils_lib.py b/lib/utils_lib.py index 1e86b271..9c288931 100644 --- a/lib/utils_lib.py +++ b/lib/utils_lib.py @@ -761,3 +761,138 @@ def collect_system_metadata(phdl, cluster_dict, config_dict, test_command=None, log.info(f'Collected metadata: {list(metadata.keys())}') return metadata + + +def get_management_node(cluster_dict): + """ + Get the management/head node from cluster configuration. + + Args: + cluster_dict: Cluster configuration dictionary + + Returns: + str: Management node IP/hostname + + Example: + >>> cluster = {'head_node_dict': {'mgmt_ip': '10.0.0.100'}} + >>> get_management_node(cluster) + '10.0.0.100' + """ + return cluster_dict.get('head_node_dict', {}).get('mgmt_ip', 'localhost') + + +def get_all_nodes(cluster_dict): + """ + Get all nodes (workers + management) from cluster configuration. + + Args: + cluster_dict: Cluster configuration dictionary + + Returns: + list: All node IPs/hostnames including management node + + Example: + >>> cluster = { + ... 
'head_node_dict': {'mgmt_ip': '10.0.0.100'}, + ... 'node_dict': {'10.0.0.101': {...}, '10.0.0.102': {...}} + ... } + >>> get_all_nodes(cluster) + ['10.0.0.100', '10.0.0.101', '10.0.0.102'] + """ + mgmt_node = get_management_node(cluster_dict) + worker_nodes = list(cluster_dict.get('node_dict', {}).keys()) + + # Management node + all workers + all_nodes = [mgmt_node] + worker_nodes + + # Remove duplicates (in case mgmt is also in node_dict) + return list(dict.fromkeys(all_nodes)) + + +def get_worker_nodes(cluster_dict): + """ + Get worker nodes only (excluding management node). + + Args: + cluster_dict: Cluster configuration dictionary + + Returns: + list: Worker node IPs/hostnames + """ + return list(cluster_dict.get('node_dict', {}).keys()) + + +def is_management_node(node, cluster_dict): + """ + Check if a node is the management/head node. + + Args: + node: Node IP/hostname to check + cluster_dict: Cluster configuration dictionary + + Returns: + bool: True if node is management node + + Example: + >>> cluster = {'head_node_dict': {'mgmt_ip': 'localhost'}} + >>> is_management_node('localhost', cluster) + True + >>> is_management_node('10.0.0.101', cluster) + False + """ + mgmt_node = get_management_node(cluster_dict) + + # Handle localhost aliases + if mgmt_node in ['localhost', '127.0.0.1'] and node in ['localhost', '127.0.0.1']: + return True + + return node == mgmt_node + + +def is_single_node_deployment(cluster_dict): + """ + Detect if this is a single-node (localhost) deployment. + + Args: + cluster_dict: Cluster configuration dictionary + + Returns: + bool: True if single-node deployment + + Example: + >>> cluster = {'head_node_dict': {'mgmt_ip': 'localhost'}, 'node_dict': {'localhost': {}}} + >>> is_single_node_deployment(cluster) + True + """ + all_nodes = get_all_nodes(cluster_dict) + + # Single node if only one unique node + if len(set(all_nodes)) == 1: + return True + + # Also single node if all nodes are localhost variants + localhost_variants = {'localhost', '127.0.0.1', '::1'} + return all(node in localhost_variants for node in all_nodes) + + +def generate_prometheus_targets(cluster_dict, exporter_port=5000): + """ + Generate Prometheus scrape targets for all nodes. + + Args: + cluster_dict: Cluster configuration dictionary + exporter_port: Port where Device Metrics Exporter runs (default: 5000) + + Returns: + list: Prometheus target strings in format "host:port" + + Example: + >>> cluster = { + ... 'head_node_dict': {'mgmt_ip': '10.0.0.100'}, + ... 'node_dict': {'10.0.0.101': {}, '10.0.0.102': {}} + ... 
} + >>> generate_prometheus_targets(cluster) + ['10.0.0.100:5000', '10.0.0.101:5000', '10.0.0.102:5000'] + """ + all_nodes = get_all_nodes(cluster_dict) + return [f"{node}:{exporter_port}" for node in all_nodes] diff --git a/tests/monitoring/install_device_metrics_exporter.py b/tests/monitoring/install_device_metrics_exporter.py index eb490fd0..ca63789d 100644 --- a/tests/monitoring/install_device_metrics_exporter.py +++ b/tests/monitoring/install_device_metrics_exporter.py @@ -259,3 +259,354 @@ def test_display_summary(phdl): log.info(f" {node}: {out_dict[node]}") log.info("Completed metrics tests successfully.") + + +# ============================================================================ +# Node Role Detection Fixtures +# ============================================================================ + +@pytest.fixture(scope='module') +def management_node(cluster_dict): + """Get the management/head node from cluster.""" + from utils_lib import get_management_node + return get_management_node(cluster_dict) + + +@pytest.fixture(scope='module') +def all_nodes(cluster_dict): + """Get all nodes (management + workers) where exporter should run.""" + from utils_lib import get_all_nodes + return get_all_nodes(cluster_dict) + + +@pytest.fixture(scope='module') +def worker_nodes(cluster_dict): + """Get worker nodes only.""" + from utils_lib import get_worker_nodes + return get_worker_nodes(cluster_dict) + + +@pytest.fixture(scope='module') +def is_single_node(cluster_dict): + """Check if this is a single-node deployment.""" + from utils_lib import is_single_node_deployment + return is_single_node_deployment(cluster_dict) + + +@pytest.fixture(scope='module') +def prometheus_targets(cluster_dict, config_dict): + """Generate Prometheus scrape targets for all nodes.""" + from utils_lib import generate_prometheus_targets + exporter_port = config_dict.get('device_metrics_exporter_port', 5000) + return generate_prometheus_targets(cluster_dict, exporter_port) + + +def is_mgmt_node(node, cluster_dict): + """Helper function to check if node is management node.""" + from utils_lib import is_management_node + return is_management_node(node, cluster_dict) + + +# Tests with Management Node Awareness + +def test_deploy_prometheus_on_management_only(cluster_dict, management_node, is_single_node, config_dict, prometheus_targets): + """ + Deploy Prometheus ONLY on management node with all targets configured. + Uses pssh for multi-node, subprocess for localhost. 
+ """ + log.info("="*80) + log.info(f"Deploying Prometheus on management node: {management_node}") + log.info(f"Targets: {prometheus_targets}") + log.info("="*80) + + import subprocess + import os + from prometheus_config_lib import generate_prometheus_config + + # Generate Prometheus config + prometheus_yml = "/tmp/prometheus_cvs.yml" + generate_prometheus_config(cluster_dict, config_dict, prometheus_yml) + log.info(f" Config generated with {len(prometheus_targets)} targets") + + prom_version = config_dict.get('prometheus_version', 'v2.55.0').lstrip('v') + + # Deploy on localhost/management node + if is_single_node or management_node in ['localhost', '127.0.0.1', '::1']: + # LOCAL DEPLOYMENT + # Stop existing + subprocess.run("sudo systemctl stop prometheus 2>/dev/null || true", shell=True) + subprocess.run("sudo pkill -9 prometheus 2>/dev/null || true", shell=True) + + # Install if needed + if not os.path.exists('/opt/prometheus/prometheus'): + log.info(f"Installing Prometheus {prom_version}...") + cmd = f"""cd /tmp && wget -q https://github.com/prometheus/prometheus/releases/download/v{prom_version}/prometheus-{prom_version}.linux-amd64.tar.gz && tar xzf prometheus-{prom_version}.linux-amd64.tar.gz && sudo mkdir -p /opt/prometheus /var/lib/prometheus/data && sudo cp -r prometheus-{prom_version}.linux-amd64/* /opt/prometheus/""" + subprocess.run(cmd, shell=True, check=True) + + # Copy config + subprocess.run(f"sudo cp {prometheus_yml} /opt/prometheus/prometheus.yml", shell=True, check=True) + + # Create systemd service + svc = """[Unit] +Description=Prometheus +After=network.target + +[Service] +Type=simple +User=root +ExecStart=/opt/prometheus/prometheus --config.file=/opt/prometheus/prometheus.yml --storage.tsdb.path=/var/lib/prometheus/data --web.listen-address=0.0.0.0:9090 +Restart=always + +[Install] +WantedBy=multi-user.target +""" + with open('/tmp/prometheus.service', 'w') as f: + f.write(svc) + subprocess.run("sudo cp /tmp/prometheus.service /etc/systemd/system/", shell=True, check=True) + subprocess.run("sudo systemctl daemon-reload && sudo systemctl enable prometheus && sudo systemctl start prometheus", shell=True, check=True) + + import time + time.sleep(3) + + # Verify + result = subprocess.run("systemctl is-active prometheus", shell=True, capture_output=True) + assert result.returncode == 0, "Prometheus not running" + log.info("SUCCESS: Prometheus running on management node (localhost)") + else: + # MULTI-NODE DEPLOYMENT via SSH to management node only + log.info(f"Deploying to remote management node: {management_node}") + from parallel_ssh_lib import ParallelSSH + + # Create SSH client for management node ONLY + mgmt_dict = {management_node: cluster_dict['node_dict'].get(management_node, {'bmc_ip': 'NA', 'vpc_ip': management_node})} + phdl = ParallelSSH( + node_dict=mgmt_dict, + username=cluster_dict['username'], + priv_key_file=cluster_dict['priv_key_file'] + ) + + # Upload config file to management node + import tempfile + with tempfile.NamedTemporaryFile(mode='w', suffix='.yml', delete=False) as f: + with open(prometheus_yml, 'r') as src: + f.write(src.read()) + temp_config = f.name + + # Deploy Prometheus on management node only + deploy_script = f""" + # Stop existing + sudo systemctl stop prometheus 2>/dev/null || true + sudo pkill -9 prometheus 2>/dev/null || true + + # Install if needed + if [ ! -f /opt/prometheus/prometheus ]; then + echo "Installing Prometheus {prom_version}..." 
+ cd /tmp + wget -q https://github.com/prometheus/prometheus/releases/download/v{prom_version}/prometheus-{prom_version}.linux-amd64.tar.gz + tar xzf prometheus-{prom_version}.linux-amd64.tar.gz + sudo mkdir -p /opt/prometheus /var/lib/prometheus/data + sudo cp -r prometheus-{prom_version}.linux-amd64/* /opt/prometheus/ + fi + + # Copy config (uploaded separately via SCP) + sudo mkdir -p /opt/prometheus + + # Create systemd service + sudo tee /etc/systemd/system/prometheus.service > /dev/null << 'SVCEOF' +[Unit] +Description=Prometheus +After=network.target + +[Service] +Type=simple +User=root +ExecStart=/opt/prometheus/prometheus --config.file=/opt/prometheus/prometheus.yml --storage.tsdb.path=/var/lib/prometheus/data --web.listen-address=0.0.0.0:9090 +Restart=always + +[Install] +WantedBy=multi-user.target +SVCEOF + + sudo systemctl daemon-reload + sudo systemctl enable prometheus + sudo systemctl start prometheus + sleep 2 + systemctl is-active prometheus + """ + + # Execute deployment on management node only + result = phdl.exec(deploy_script) + + # Verify deployment succeeded + for node, output in result.items(): + if 'active' not in output: + fail_test(f"Prometheus deployment failed on {node}: {output}") + + log.info(f"SUCCESS: Prometheus deployed and running on management node: {management_node}") + log.info("SUCCESS: ENFORCEMENT: Prometheus deployed ONLY to management node, NOT to workers") + +def test_deploy_grafana_on_management_only(cluster_dict, management_node, is_single_node, config_dict): + """ + Deploy Grafana ONLY on management node. + Uses pssh for multi-node, subprocess for localhost. + """ + log.info(f"Deploying Grafana on management node: {management_node}") + + import subprocess + import os + + grafana_version = config_dict.get('grafana_version', '10.4.1') + grafana_port = config_dict.get('grafana_port', '3000') + + if is_single_node or management_node in ['localhost', '127.0.0.1', '::1']: + # LOCAL DEPLOYMENT + # Stop existing + subprocess.run("docker stop grafana 2>/dev/null || true", shell=True) + subprocess.run("docker rm grafana 2>/dev/null || true", shell=True) + + # Create data directory + grafana_data = "/home/svdt-8/manoj/cvs/cvs/monitoring/grafana_data" + os.makedirs(grafana_data, exist_ok=True) + subprocess.run(f"sudo chown -R 472:472 {grafana_data}", shell=True, check=True) + + # Start Grafana + cmd = f"""docker run -d \ + --name grafana \ + --network host \ + --restart unless-stopped \ + -v {grafana_data}:/var/lib/grafana \ + grafana/grafana:{grafana_version}""" + subprocess.run(cmd, shell=True, check=True) + + import time + time.sleep(3) + + # Verify + result = subprocess.run("docker ps | grep grafana", shell=True, capture_output=True) + assert result.returncode == 0, "Grafana not running" + log.info(f"SUCCESS: Grafana running on management node (localhost) port {grafana_port}") + else: + # MULTI-NODE DEPLOYMENT via SSH to management node only + log.info(f"Deploying to remote management node: {management_node}") + from parallel_ssh_lib import ParallelSSH + + # Create SSH client for management node ONLY + mgmt_dict = {management_node: cluster_dict['node_dict'].get(management_node, {'bmc_ip': 'NA', 'vpc_ip': management_node})} + phdl = ParallelSSH( + node_dict=mgmt_dict, + username=cluster_dict['username'], + priv_key_file=cluster_dict['priv_key_file'] + ) + + # Deploy Grafana on management node only + deploy_script = f""" + # Stop existing + docker stop grafana 2>/dev/null || true + docker rm grafana 2>/dev/null || true + + # Create data directory + mkdir 
-p /tmp/grafana_data + sudo chown -R 472:472 /tmp/grafana_data + + # Start Grafana + docker run -d \ + --name grafana \ + --network host \ + --restart unless-stopped \ + -v /tmp/grafana_data:/var/lib/grafana \ + grafana/grafana:{grafana_version} + + sleep 3 + docker ps | grep grafana + """ + + # Execute deployment on management node only + result = phdl.exec(deploy_script) + + # Verify deployment succeeded + for node, output in result.items(): + if 'grafana' not in output: + fail_test(f"Grafana deployment failed on {node}: {output}") + + log.info(f"SUCCESS: Grafana deployed and running on management node: {management_node}") + log.info("SUCCESS: ENFORCEMENT: Grafana deployed ONLY to management node, NOT to workers") + +def test_verify_all_nodes_for_exporter(all_nodes, management_node): + """ + Verify that exporter targets include all nodes (management + workers). + """ + log.info("="*80) + log.info(f"All nodes where exporter should run:") + for node in all_nodes: + is_mgmt = " (MANAGEMENT)" if node == management_node else "" + log.info(f" • {node}{is_mgmt}") + log.info("="*80) + + assert len(all_nodes) > 0 + assert management_node in all_nodes + log.info(f" Total nodes for exporter deployment: {len(all_nodes)}") + + +def test_prometheus_scrape_targets(prometheus_targets, all_nodes): + """ + Verify Prometheus scrape targets include all nodes. + """ + log.info("="*80) + log.info("Prometheus scrape targets:") + for target in prometheus_targets: + log.info(f" • {target}") + log.info("="*80) + + assert len(prometheus_targets) == len(all_nodes) + log.info(f" Scrape targets generated for all {len(all_nodes)} nodes") + + +def test_verify_service_distribution(cluster_dict, management_node, all_nodes, worker_nodes, is_single_node): + """ + CRITICAL TEST: Verify service distribution enforcement. + - Exporter must be on ALL nodes (management + workers) + - Prometheus must be ONLY on management node + - Grafana must be ONLY on management node + """ + log.info("="*80) + log.info("VERIFYING SERVICE DISTRIBUTION ENFORCEMENT") + log.info("="*80) + + # Show the architecture + log.info(f"\n Cluster Architecture:") + log.info(f" Management Node: {management_node}") + log.info(f" Worker Nodes: {worker_nodes if worker_nodes else 'None (single-node)'}") + log.info(f" Total Nodes: {len(all_nodes)}") + log.info(f" Deployment Type: {'Single-Node' if is_single_node else 'Multi-Node'}") + + log.info(f"\nSUCCESS: SERVICE DISTRIBUTION RULES:") + log.info(f" 1. Device Metrics Exporter → ALL {len(all_nodes)} nodes") + for node in all_nodes: + marker = "(MANAGEMENT)" if node == management_node else "(WORKER)" + log.info(f" {node} {marker}") + + log.info(f"\n 2. Prometheus → ONLY management node") + log.info(f" {management_node} (MANAGEMENT ONLY)") + if worker_nodes: + for node in worker_nodes: + log.info(f" {node} (NOT deployed)") + + log.info(f"\n 3. 
Grafana → ONLY management node") + log.info(f" {management_node} (MANAGEMENT ONLY)") + if worker_nodes: + for node in worker_nodes: + log.info(f" {node} (NOT deployed)") + + log.info(f"\n" + "="*80) + log.info("SUCCESS: SERVICE DISTRIBUTION VERIFIED") + log.info("="*80) + + # Assert the rules + assert len(all_nodes) >= 1, "Must have at least one node" + assert management_node in all_nodes, "Management node must be in all_nodes list" + + if not is_single_node: + assert len(worker_nodes) > 0, "Multi-node must have workers" + log.info(f"SUCCESS: ENFORCEMENT VERIFIED: Multi-node cluster with proper separation") + else: + log.info(f"SUCCESS: ENFORCEMENT VERIFIED: Single-node deployment (all services on localhost)") From d60445fe99dc4c88f5d301e65926c7b794b04519 Mon Sep 17 00:00:00 2001 From: Manoj S K Date: Mon, 24 Nov 2025 03:57:40 -0800 Subject: [PATCH 7/7] fixes after cluster tests --- .../install_device_metrics_exporter.py | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) diff --git a/tests/monitoring/install_device_metrics_exporter.py b/tests/monitoring/install_device_metrics_exporter.py index ca63789d..66c370da 100644 --- a/tests/monitoring/install_device_metrics_exporter.py +++ b/tests/monitoring/install_device_metrics_exporter.py @@ -363,7 +363,7 @@ def test_deploy_prometheus_on_management_only(cluster_dict, management_node, is_ with open('/tmp/prometheus.service', 'w') as f: f.write(svc) subprocess.run("sudo cp /tmp/prometheus.service /etc/systemd/system/", shell=True, check=True) - subprocess.run("sudo systemctl daemon-reload && sudo systemctl enable prometheus && sudo systemctl start prometheus", shell=True, check=True) + subprocess.run("sudo systemctl daemon-reload && sudo systemctl enable prometheus && sudo systemctl restart prometheus", shell=True, check=True) import time time.sleep(3) @@ -375,15 +375,11 @@ def test_deploy_prometheus_on_management_only(cluster_dict, management_node, is_ else: # MULTI-NODE DEPLOYMENT via SSH to management node only log.info(f"Deploying to remote management node: {management_node}") - from parallel_ssh_lib import ParallelSSH + from parallel_ssh_lib import Pssh # Create SSH client for management node ONLY mgmt_dict = {management_node: cluster_dict['node_dict'].get(management_node, {'bmc_ip': 'NA', 'vpc_ip': management_node})} - phdl = ParallelSSH( - node_dict=mgmt_dict, - username=cluster_dict['username'], - priv_key_file=cluster_dict['priv_key_file'] - ) + phdl = Pssh(log, list(mgmt_dict.keys()), user=cluster_dict['username'], pkey=cluster_dict['priv_key_file']) # Upload config file to management node import tempfile @@ -488,15 +484,11 @@ def test_deploy_grafana_on_management_only(cluster_dict, management_node, is_sin else: # MULTI-NODE DEPLOYMENT via SSH to management node only log.info(f"Deploying to remote management node: {management_node}") - from parallel_ssh_lib import ParallelSSH + from parallel_ssh_lib import Pssh # Create SSH client for management node ONLY mgmt_dict = {management_node: cluster_dict['node_dict'].get(management_node, {'bmc_ip': 'NA', 'vpc_ip': management_node})} - phdl = ParallelSSH( - node_dict=mgmt_dict, - username=cluster_dict['username'], - priv_key_file=cluster_dict['priv_key_file'] - ) + phdl = Pssh(log, list(mgmt_dict.keys()), user=cluster_dict['username'], pkey=cluster_dict['priv_key_file']) # Deploy Grafana on management node only deploy_script = f"""
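
Reviewer note (not part of the patch): a minimal sketch of how the new lib/utils_lib.py helpers and the Prometheus config generator are expected to compose. The dict shapes and the prometheus_config_lib import path mirror what the tests above use; the concrete addresses are the illustrative ones from the helper docstrings.

    # Sketch only: exercises the new helpers with the illustrative addresses
    # from the docstrings above; not executed anywhere in the patch itself.
    from utils_lib import get_all_nodes, get_worker_nodes, generate_prometheus_targets
    from prometheus_config_lib import generate_prometheus_config  # import path as used by the tests

    cluster_dict = {
        'head_node_dict': {'mgmt_ip': '10.0.0.100'},
        'node_dict': {'10.0.0.101': {}, '10.0.0.102': {}},
    }
    config_dict = {'scrape_interval': '15s', 'device_metrics_exporter_port': 5000}

    get_all_nodes(cluster_dict)             # ['10.0.0.100', '10.0.0.101', '10.0.0.102']
    get_worker_nodes(cluster_dict)          # ['10.0.0.101', '10.0.0.102']
    generate_prometheus_targets(cluster_dict, 5000)
    # ['10.0.0.100:5000', '10.0.0.101:5000', '10.0.0.102:5000']

    # Writes a prometheus.yml with one static target per node and a
    # metric_relabel_configs rule that keeps only series matching gpu_.*
    generate_prometheus_config(cluster_dict, config_dict, '/tmp/prometheus_cvs.yml')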
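
Reviewer note (not part of the patch): once the stack is deployed, a test can query the management node's Prometheus for GPU series captured during a run. A minimal sketch, assuming the requests package and an illustrative gpu_.* metric name (the exporter's real series are whatever match the kept gpu_.* pattern):

    # Sketch only: range query against the Prometheus HTTP API on the management node.
    import requests

    prometheus_url = "http://10.0.0.100:9090"   # management node; port matches the service unit's web.listen-address

    resp = requests.get(
        f"{prometheus_url}/api/v1/query_range",
        params={
            "query": "gpu_edge_temperature",    # illustrative name; any kept gpu_.* series works
            "start": "2025-11-24T10:00:00Z",    # example test window start
            "end": "2025-11-24T10:30:00Z",      # example test window end
            "step": "15s",                      # matches the configured scrape_interval
        },
        timeout=10,
    )
    resp.raise_for_status()
    for series in resp.json()["data"]["result"]:
        print(series["metric"], len(series["values"]), "samples")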