diff --git a/input/cluster_file/sample_monitor_cluster.json b/input/cluster_file/sample_monitor_cluster.json new file mode 100644 index 00000000..27869619 --- /dev/null +++ b/input/cluster_file/sample_monitor_cluster.json @@ -0,0 +1,13 @@ +{ + "username": "svdt-8", + "priv_key_file": "/home/svdt-8/.ssh/id_rsa", + "head_node_dict": { + "mgmt_ip": "localhost" + }, + "node_dict": { + "localhost": { + "bmc_ip": "NA", + "vpc_ip": "localhost" + } + } +} diff --git a/input/config_file/monitoring/monitoring_config.json b/input/config_file/monitoring/monitoring_config.json new file mode 100644 index 00000000..4e7ee738 --- /dev/null +++ b/input/config_file/monitoring/monitoring_config.json @@ -0,0 +1,38 @@ +{ + "monitoring": { + "device_metrics_exporter_version": "{device-metrics-version}", + "device_metrics_exporter_image": "rocm/device-metrics-exporter:{device-metrics-version}", + "device_metrics_exporter_port": 5000, + + "prometheus_host": "{prometheus-host}", + "prometheus_port": 9090, + "prometheus_version": "{prometheus-version}", + "prometheus_url": "http://{prometheus-host}:{prometheus-port}", + + "grafana_host": "{grafana-host}", + "grafana_port": 3000, + "grafana_version": "{grafana-version}", + "grafana_url": "http://{grafana-host}:{grafana-port}", + "grafana_username": "admin", + "grafana_password": "{grafana-password}", + "grafana_api_key": "{grafana-api-key}", + + "scrape_interval": "15s", + "scrape_timeout": "10s", + "retention_days": 30, + + "alert_thresholds": { + "temperature_warning": 95, + "temperature_critical": 105, + "power_warning": 700, + "ecc_error_rate_warning": 10, + "memory_usage_warning": 90 + }, + + "deployment": { + "docker_network": "host", + "restart_policy": "unless-stopped", + "log_level": "INFO" + } + } +} \ No newline at end of file diff --git a/lib/device_metrics_lib.py b/lib/device_metrics_lib.py new file mode 100644 index 00000000..2eef53e8 --- /dev/null +++ b/lib/device_metrics_lib.py @@ -0,0 +1,242 @@ +''' +Copyright 2025 Advanced Micro Devices, Inc. 
+Device Metrics Integration Library for CVS +''' + +import requests +import json +import logging +from typing import Dict, List, Optional, Any +from datetime import datetime + +log = logging.getLogger(__name__) + + +class PrometheusClient: + """Client for querying Prometheus API to retrieve GPU metrics.""" + + def __init__(self, prometheus_url: str, timeout: int = 30): + self.base_url = prometheus_url.rstrip('/') + self.timeout = timeout + self.api_url = f"{self.base_url}/api/v1" + + def check_health(self) -> bool: + """Check if Prometheus server is healthy.""" + try: + response = requests.get(f"{self.base_url}/-/healthy", timeout=5) + return response.status_code == 200 + except Exception as e: + log.error(f"Prometheus health check failed: {e}") + return False + + def query_instant(self, query: str, time: Optional[str] = None) -> Optional[Dict]: + """Execute instant Prometheus query.""" + params = {'query': query} + if time: + params['time'] = time + + try: + response = requests.get( + f"{self.api_url}/query", + params=params, + timeout=self.timeout + ) + response.raise_for_status() + data = response.json() + + if data.get('status') == 'success': + return data.get('data') + else: + log.error(f"Prometheus query failed: {data.get('error')}") + return None + + except Exception as e: + log.error(f"Error querying Prometheus: {e}") + return None + + def query_range(self, query: str, start: str, end: str, step: str = '15s') -> Optional[Dict]: + """Execute range Prometheus query for time-series data.""" + params = { + 'query': query, + 'start': start, + 'end': end, + 'step': step + } + + try: + response = requests.get( + f"{self.api_url}/query_range", + params=params, + timeout=self.timeout + ) + response.raise_for_status() + data = response.json() + + if data.get('status') == 'success': + return data.get('data') + else: + log.error(f"Prometheus range query failed: {data.get('error')}") + return None + + except Exception as e: + log.error(f"Error querying Prometheus range: {e}") + return None + + +def get_gpu_metrics_from_prometheus( + prom_client: PrometheusClient, + node: str, + gpu_ids: Optional[List[str]] = None +) -> Dict[str, Dict[str, float]]: + """ + Retrieve GPU metrics from Prometheus for a specific node. 
+ + Returns: + { + '0': {'temperature': 45.0, 'power': 300.5, 'utilization': 85.0}, + '1': {'temperature': 46.0, 'power': 295.3, 'utilization': 82.0} + } + """ + metrics_dict = {} + + # Query temperature + temp_query = f'amdgpu_temperature_celsius{{node="{node}", sensor="edge"}}' + temp_data = prom_client.query_instant(temp_query) + + if temp_data and temp_data.get('result'): + for result in temp_data['result']: + gpu_id = result['metric'].get('gpu', 'unknown') + if gpu_ids is None or gpu_id in gpu_ids: + if gpu_id not in metrics_dict: + metrics_dict[gpu_id] = {} + metrics_dict[gpu_id]['temperature'] = float(result['value'][1]) + + # Query power consumption + power_query = f'amdgpu_power_watts{{node="{node}"}}' + power_data = prom_client.query_instant(power_query) + + if power_data and power_data.get('result'): + for result in power_data['result']: + gpu_id = result['metric'].get('gpu', 'unknown') + if gpu_ids is None or gpu_id in gpu_ids: + if gpu_id not in metrics_dict: + metrics_dict[gpu_id] = {} + metrics_dict[gpu_id]['power'] = float(result['value'][1]) + + # Query GPU utilization + util_query = f'amdgpu_gpu_busy_percent{{node="{node}"}}' + util_data = prom_client.query_instant(util_query) + + if util_data and util_data.get('result'): + for result in util_data['result']: + gpu_id = result['metric'].get('gpu', 'unknown') + if gpu_ids is None or gpu_id in gpu_ids: + if gpu_id not in metrics_dict: + metrics_dict[gpu_id] = {} + metrics_dict[gpu_id]['utilization'] = float(result['value'][1]) + + return metrics_dict + + +def get_device_exporter_health( + prom_client: PrometheusClient, + nodes: List[str] +) -> Dict[str, bool]: + """Check health status of Device Metrics Exporter on all nodes.""" + health_dict = {} + + for node in nodes: + query = f'up{{job="device-metrics-exporter", node="{node}"}}' + data = prom_client.query_instant(query) + + if data and data.get('result'): + is_up = float(data['result'][0]['value'][1]) == 1.0 + health_dict[node] = is_up + else: + health_dict[node] = False + + return health_dict + + +def create_grafana_annotation( + grafana_url: str, + text: str, + tags: List[str] = None, + api_key: Optional[str] = None, + username: Optional[str] = None, + password: Optional[str] = None, + time: Optional[int] = None +) -> bool: + """Create annotation in Grafana to mark test events on dashboards.""" + if tags is None: + tags = ['cvs-test'] + + if time is None: + time = int(datetime.now().timestamp() * 1000) + + url = f"{grafana_url.rstrip('/')}/api/annotations" + + payload = { + 'text': text, + 'tags': tags, + 'time': time + } + + headers = {'Content-Type': 'application/json'} + + if not api_key and (not username or not password): + log.warning("Grafana annotation requested without credentials or API key; skipping.") + return False + + if api_key: + headers['Authorization'] = f'Bearer {api_key}' + auth = None + else: + auth = (username, password) + + try: + response = requests.post( + url, + json=payload, + headers=headers, + auth=auth, + timeout=10 + ) + response.raise_for_status() + log.info(f"Created Grafana annotation: {text}") + return True + + except Exception as e: + log.error(f"Failed to create Grafana annotation: {e}") + return False + + +# Test function +if __name__ == '__main__': + import sys + + if len(sys.argv) < 3: + print("Usage: python device_metrics_lib.py ") + print("Example: python device_metrics_lib.py http://localhost:9090 localhost") + sys.exit(1) + + prometheus_url = sys.argv[1] + node = sys.argv[2] + + print(f"Testing Prometheus integration with 
{prometheus_url}") + + client = PrometheusClient(prometheus_url) + + if not client.check_health(): + print("ERROR: Prometheus server is not healthy") + sys.exit(1) + print("Prometheus server is healthy") + + metrics = get_gpu_metrics_from_prometheus(client, node) + if metrics: + print(f"Retrieved metrics for {len(metrics)} GPUs") + for gpu_id, data in metrics.items(): + print(f" GPU {gpu_id}: Temp={data.get('temperature', 'N/A')}°C, " + f"Power={data.get('power', 'N/A')}W") + else: + print("WARNING: No GPU metrics found") diff --git a/lib/gpu_metrics_lib.py b/lib/gpu_metrics_lib.py new file mode 100644 index 00000000..c9d6b556 --- /dev/null +++ b/lib/gpu_metrics_lib.py @@ -0,0 +1,416 @@ +''' +Copyright 2025 Advanced Micro Devices, Inc. +All rights reserved. +''' + +""" +Device Metrics Integration Library for CVS + +This module provides integration between CVS and AMD ROCm Device Metrics Exporter +via Prometheus. It enables CVS to query GPU metrics from Prometheus instead of +(or in addition to) SSH-based amd-smi/rocm-smi commands. + +Device Metrics Exporter: https://github.com/ROCm/device-metrics-exporter +""" + +import requests +import json +import time +from datetime import datetime, timedelta +from typing import Dict, List, Optional, Any, Tuple +import logging + +log = logging.getLogger(__name__) + + +class PrometheusClient: + """ + Client for querying Prometheus server that scrapes Device Metrics Exporter. + """ + + def __init__(self, prometheus_url: str , timeout: int = 30): + if not prometheus_url: + # fall back only if truly absent + prometheus_url = os.getenv("PROMETHEUS_URL", "http://localhost:9090") + self.prometheus_url = prometheus_url.rstrip('/') + self.timeout = timeout + self.api_url = f"{self.prometheus_url}/api/v1" + log.info(f"Initialized Prometheus client for {self.prometheus_url}") + + def check_health(self) -> bool: + """Check if Prometheus server is healthy and reachable.""" + try: + response = requests.get( + f"{self.prometheus_url}/-/healthy", + timeout=self.timeout + ) + if response.status_code == 200: + log.info(f"✓ Prometheus server at {self.prometheus_url} is healthy") + return True + else: + log.error(f"✗ Prometheus health check failed with status {response.status_code}") + return False + except Exception as e: + log.error(f"✗ Failed to connect to Prometheus at {self.prometheus_url}: {e}") + return False + + def query_instant(self, query: str) -> Dict[str, Any]: + """Execute an instant PromQL query.""" + try: + response = requests.get( + f"{self.api_url}/query", + params={'query': query}, + timeout=self.timeout + ) + response.raise_for_status() + result = response.json() + + if result.get('status') == 'success': + log.debug(f"Query successful: {query[:50]}...") + else: + log.warning(f"Query returned non-success status: {result.get('error', 'Unknown error')}") + + return result + except Exception as e: + log.error(f"Prometheus instant query failed: {e}") + return {"status": "error", "error": str(e)} + + def query_range(self, query: str, start_time: datetime, end_time: datetime, + step: str = "15s") -> Dict[str, Any]: + """Execute a range PromQL query for time-series data.""" + try: + response = requests.get( + f"{self.api_url}/query_range", + params={ + 'query': query, + 'start': start_time.timestamp(), + 'end': end_time.timestamp(), + 'step': step + }, + timeout=self.timeout + ) + response.raise_for_status() + result = response.json() + + if result.get('status') == 'success': + log.debug(f"Range query successful: {query[:50]}... 
[{start_time} to {end_time}]") + + return result + except Exception as e: + log.error(f"Prometheus range query failed: {e}") + return {"status": "error", "error": str(e)} + + def get_targets(self) -> List[Dict[str, Any]]: + """Get list of all scrape targets (Device Metrics Exporters) and their status.""" + try: + response = requests.get( + f"{self.api_url}/targets", + timeout=self.timeout + ) + response.raise_for_status() + data = response.json() + + if data.get('status') == 'success': + targets = data.get('data', {}).get('activeTargets', []) + log.info(f"Retrieved {len(targets)} active targets from Prometheus") + return targets + return [] + except Exception as e: + log.error(f"Failed to get Prometheus targets: {e}") + return [] + + +# Device Metrics Exporter metric names (as of v1.4.0) +DEVICE_METRICS_MAP = { + # Temperature metrics + 'temperature_edge': 'amdgpu_temperature_edge_celsius', + 'temperature_junction': 'amdgpu_temperature_junction_celsius', + 'temperature_memory': 'amdgpu_temperature_memory_celsius', + 'temperature_hbm': 'amdgpu_temperature_hbm_celsius', + + # Utilization metrics + 'gpu_utilization': 'amdgpu_gpu_utilization_percent', + 'memory_utilization': 'amdgpu_memory_utilization_percent', + + # Power metrics + 'power_current': 'amdgpu_power_watts', + 'power_average': 'amdgpu_power_average_watts', + 'energy_consumed': 'amdgpu_energy_joules', + + # Memory metrics + 'memory_used': 'amdgpu_memory_used_bytes', + 'memory_total': 'amdgpu_memory_total_bytes', + 'memory_free': 'amdgpu_memory_free_bytes', + + # Clock metrics + 'clock_gpu': 'amdgpu_gpu_clock_mhz', + 'clock_memory': 'amdgpu_memory_clock_mhz', + + # PCIe metrics + 'pcie_bandwidth': 'amdgpu_pcie_bandwidth_bytes', + 'pcie_link_speed': 'amdgpu_pcie_link_speed_mbps', + 'pcie_link_width': 'amdgpu_pcie_link_width', + 'pcie_replay_count': 'amdgpu_pcie_replay_count_total', + 'pcie_nak_sent': 'amdgpu_pcie_nak_sent_total', + 'pcie_nak_received': 'amdgpu_pcie_nak_received_total', + + # Error metrics + 'ecc_correctable': 'amdgpu_ecc_correctable_errors_total', + 'ecc_uncorrectable': 'amdgpu_ecc_uncorrectable_errors_total', + 'ras_correctable': 'amdgpu_ras_correctable_error_count', + 'ras_uncorrectable': 'amdgpu_ras_uncorrectable_error_count', +} + + +def get_gpu_metrics_from_prometheus(prom_client: PrometheusClient, + node_list: Optional[List[str]] = None, + metrics: Optional[List[str]] = None) -> Dict[str, Dict]: + """ + Query current GPU metrics from Prometheus for all or specific nodes. 
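+
+    Example (illustrative; the host and node name are placeholders):
+        prom = PrometheusClient('http://localhost:9090')
+        data = get_gpu_metrics_from_prometheus(prom, node_list=['node-1'],
+                                               metrics=['temperature_edge', 'power_current'])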
+ + Returns: + Dict with structure: {node: {gpu_id: {metric_name: value}}} + """ + metrics_dict = {} + + if metrics is None: + metrics = [ + 'temperature_edge', 'temperature_junction', 'temperature_memory', + 'power_current', 'power_average', + 'gpu_utilization', 'memory_utilization', + 'memory_used', 'memory_total', + 'pcie_bandwidth', 'pcie_link_speed', + 'ecc_correctable', 'ecc_uncorrectable', + 'clock_gpu', 'clock_memory' + ] + + for metric_key in metrics: + if metric_key not in DEVICE_METRICS_MAP: + log.warning(f"Unknown metric key: {metric_key}, skipping") + continue + + metric_name = DEVICE_METRICS_MAP[metric_key] + + # Build query with optional node filter + if node_list: + node_filter = '|'.join([node.replace('.', '\\.') for node in node_list]) + query = f'{metric_name}{{instance=~"({node_filter}):.*"}}' + else: + query = metric_name + + result = prom_client.query_instant(query) + + if result.get('status') == 'success': + for item in result.get('data', {}).get('result', []): + labels = item.get('metric', {}) + instance = labels.get('instance', '') + node = instance.split(':')[0] if ':' in instance else instance + gpu_id = labels.get('gpu', labels.get('gpu_id', 'unknown')) + value = item.get('value', [None, None])[1] + + try: + if value is not None: + value = float(value) + except (ValueError, TypeError): + pass + + if node not in metrics_dict: + metrics_dict[node] = {} + if gpu_id not in metrics_dict[node]: + metrics_dict[node][gpu_id] = {} + + metrics_dict[node][gpu_id][metric_key] = value + else: + log.warning(f"Failed to query metric {metric_key}: {result.get('error', 'Unknown error')}") + + log.info(f"Retrieved metrics for {len(metrics_dict)} nodes, {len(metrics)} metric types") + return metrics_dict + + +def get_device_exporter_health(prom_client: PrometheusClient, + node_list: Optional[List[str]] = None) -> Dict[str, Dict]: + """ + Check health status of Device Metrics Exporter on all nodes. 
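+
+    Returns a dict keyed by node; the shape below is illustrative:
+        {'node-1': {'health': 'up', 'last_scrape': '...', 'last_error': ''}}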
+ """ + health_dict = {} + targets = prom_client.get_targets() + + for target in targets: + labels = target.get('labels', {}) + instance = labels.get('instance', '') + job = labels.get('job', '') + + if 'device-metrics' not in job.lower() and 'amd' not in job.lower(): + continue + + node = instance.split(':')[0] if ':' in instance else instance + + if node_list and node not in node_list: + continue + + health_dict[node] = { + 'health': target.get('health', 'unknown'), + 'last_scrape': target.get('lastScrape', ''), + 'scrape_duration': target.get('lastScrapeDuration', 0), + 'last_error': target.get('lastError', ''), + 'scrape_url': target.get('scrapeUrl', ''), + 'labels': labels + } + + up_count = sum(1 for h in health_dict.values() if h['health'] == 'up') + down_count = sum(1 for h in health_dict.values() if h['health'] == 'down') + log.info(f"Exporter health: {up_count} up, {down_count} down out of {len(health_dict)} nodes") + + return health_dict + + +def create_grafana_annotation(grafana_url: str, api_key: str, + text: str, tags: List[str], + start_time: Optional[datetime] = None, + end_time: Optional[datetime] = None) -> bool: + """Create an annotation in Grafana to mark CVS test events.""" + try: + url = f"{grafana_url.rstrip('/')}/api/annotations" + headers = { + 'Authorization': f'Bearer {api_key}', + 'Content-Type': 'application/json' + } + + if start_time is None: + start_time = datetime.now() + + data = { + 'text': text, + 'tags': tags, + 'time': int(start_time.timestamp() * 1000) + } + + if end_time: + data['timeEnd'] = int(end_time.timestamp() * 1000) + + response = requests.post(url, headers=headers, json=data, timeout=10) + response.raise_for_status() + + log.info(f"✓ Created Grafana annotation: {text}") + return True + + except Exception as e: + log.error(f"✗ Failed to create Grafana annotation: {e}") + return False + + +def compare_ssh_vs_prometheus(ssh_metrics: Dict, prom_metrics: Dict, + tolerance: float = 5.0) -> Dict: + """Compare metrics collected via SSH vs Prometheus to validate consistency.""" + comparison = { + 'summary': { + 'total_nodes': 0, + 'matching_nodes': 0, + 'discrepancy_nodes': 0, + 'ssh_only_nodes': 0, + 'prom_only_nodes': 0 + }, + 'node_comparisons': [], + 'discrepancies': [] + } + + ssh_nodes = set(ssh_metrics.keys()) + prom_nodes = set(prom_metrics.keys()) + + comparison['summary']['total_nodes'] = len(ssh_nodes | prom_nodes) + comparison['summary']['ssh_only_nodes'] = len(ssh_nodes - prom_nodes) + comparison['summary']['prom_only_nodes'] = len(prom_nodes - ssh_nodes) + + for node in (ssh_nodes - prom_nodes): + log.warning(f"Node {node} only in SSH metrics (not in Prometheus)") + comparison['node_comparisons'].append({ + 'node': node, + 'status': 'ssh_only', + 'gpu_count_match': False + }) + + for node in (prom_nodes - ssh_nodes): + log.warning(f"Node {node} only in Prometheus metrics (not in SSH)") + comparison['node_comparisons'].append({ + 'node': node, + 'status': 'prom_only', + 'gpu_count_match': False + }) + + common_nodes = ssh_nodes & prom_nodes + + for node in common_nodes: + node_comparison = { + 'node': node, + 'status': 'match', + 'gpu_count_match': True, + 'metric_comparisons': [] + } + + ssh_gpus = set(ssh_metrics[node].keys()) + prom_gpus = set(prom_metrics[node].keys()) + + if ssh_gpus != prom_gpus: + node_comparison['gpu_count_match'] = False + node_comparison['status'] = 'discrepancy' + log.warning(f"Node {node}: GPU count mismatch") + + common_gpus = ssh_gpus & prom_gpus + for gpu_id in common_gpus: + ssh_gpu = 
ssh_metrics[node][gpu_id] + prom_gpu = prom_metrics[node][gpu_id] + + ssh_metric_keys = set(ssh_gpu.keys()) + prom_metric_keys = set(prom_gpu.keys()) + common_metrics = ssh_metric_keys & prom_metric_keys + + for metric_key in common_metrics: + ssh_val = ssh_gpu[metric_key] + prom_val = prom_gpu[metric_key] + + if ssh_val is None or prom_val is None: + continue + + try: + ssh_num = float(ssh_val) + prom_num = float(prom_val) + + if ssh_num != 0: + diff_percent = abs((prom_num - ssh_num) / ssh_num) * 100 + else: + diff_percent = 0 if prom_num == 0 else 100 + + if diff_percent > tolerance: + node_comparison['status'] = 'discrepancy' + comparison['discrepancies'].append({ + 'node': node, + 'gpu': str(gpu_id), + 'metric': metric_key, + 'ssh_value': ssh_num, + 'prom_value': prom_num, + 'diff_percent': round(diff_percent, 2) + }) + except (ValueError, TypeError): + if str(ssh_val) != str(prom_val): + node_comparison['status'] = 'discrepancy' + comparison['discrepancies'].append({ + 'node': node, + 'gpu': str(gpu_id), + 'metric': metric_key, + 'ssh_value': str(ssh_val), + 'prom_value': str(prom_val), + 'diff_percent': None + }) + + comparison['node_comparisons'].append(node_comparison) + + if node_comparison['status'] == 'match': + comparison['summary']['matching_nodes'] += 1 + else: + comparison['summary']['discrepancy_nodes'] += 1 + + log.info(f"Comparison complete: {comparison['summary']['matching_nodes']}/{len(common_nodes)} nodes match") + if comparison['discrepancies']: + log.warning(f"Found {len(comparison['discrepancies'])} metric discrepancies") + + return comparison diff --git a/lib/grafana_config_lib.py b/lib/grafana_config_lib.py new file mode 100644 index 00000000..5849da2a --- /dev/null +++ b/lib/grafana_config_lib.py @@ -0,0 +1,250 @@ +""" +Grafana configuration and provisioning library +""" +import os +import json +import logging + +log = logging.getLogger(__name__) + + +def setup_grafana_provisioning(monitoring_dir="/tmp/grafana_provisioning"): + """ + Setup Grafana provisioning configs for datasources and dashboards + """ + os.makedirs(f"{monitoring_dir}/datasources", exist_ok=True) + os.makedirs(f"{monitoring_dir}/dashboards", exist_ok=True) + os.makedirs(f"{monitoring_dir}/dashboard_files", exist_ok=True) + + # Datasource config + datasource_config = """apiVersion: 1 + +datasources: + - name: Prometheus + type: prometheus + access: proxy + url: http://localhost:9090 + isDefault: true + editable: false + jsonData: + timeInterval: "5s" +""" + + with open(f"{monitoring_dir}/datasources/prometheus.yml", 'w') as f: + f.write(datasource_config) + + # Dashboard provisioning config + dashboard_config = """apiVersion: 1 + +providers: + - name: 'Default' + orgId: 1 + folder: '' + type: file + disableDeletion: false + updateIntervalSeconds: 10 + allowUiUpdates: true + options: + path: /var/lib/grafana/dashboards +""" + + with open(f"{monitoring_dir}/dashboards/default.yml", 'w') as f: + f.write(dashboard_config) + + log.info(f"Grafana provisioning configs created in {monitoring_dir}") + return monitoring_dir + + +def create_gpu_dashboard(output_file="/tmp/grafana_provisioning/dashboard_files/gpu-metrics.json"): + """ + Create GPU metrics dashboard JSON + """ + dashboard = { + "annotations": {"list": []}, + "editable": True, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": None, + "links": [], + "liveNow": False, + "panels": [ + { + "datasource": {"type": "prometheus", "uid": "prometheus"}, + "fieldConfig": { + "defaults": { + "color": {"mode": "palette-classic"}, + "custom": { + 
"axisCenteredZero": False, + "axisColorMode": "text", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "showPoints": "never" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + {"color": "green", "value": None}, + {"color": "yellow", "value": 70}, + {"color": "red", "value": 85} + ] + }, + "unit": "celsius" + } + }, + "gridPos": {"h": 8, "w": 12, "x": 0, "y": 0}, + "id": 1, + "options": { + "legend": {"displayMode": "list", "placement": "bottom", "showLegend": True}, + "tooltip": {"mode": "multi"} + }, + "targets": [ + { + "datasource": {"type": "prometheus", "uid": "prometheus"}, + "expr": "gpu_temp_degrees", + "legendFormat": "{{instance}} - GPU {{gpu_index}}", + "refId": "A" + } + ], + "title": "GPU Temperature", + "type": "timeseries" + }, + { + "datasource": {"type": "prometheus", "uid": "prometheus"}, + "fieldConfig": { + "defaults": { + "color": {"mode": "palette-classic"}, + "custom": { + "axisCenteredZero": False, + "axisColorMode": "text", + "axisPlacement": "auto", + "drawStyle": "line", + "fillOpacity": 10, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5 + }, + "mappings": [], + "max": 100, + "min": 0, + "unit": "percent" + } + }, + "gridPos": {"h": 8, "w": 12, "x": 12, "y": 0}, + "id": 2, + "options": { + "legend": {"displayMode": "list", "placement": "bottom", "showLegend": True}, + "tooltip": {"mode": "multi"} + }, + "targets": [ + { + "datasource": {"type": "prometheus", "uid": "prometheus"}, + "expr": "gpu_utilization_percent", + "legendFormat": "{{instance}} - GPU {{gpu_index}}", + "refId": "A" + } + ], + "title": "GPU Utilization %", + "type": "timeseries" + }, + { + "datasource": {"type": "prometheus", "uid": "prometheus"}, + "fieldConfig": { + "defaults": { + "color": {"mode": "palette-classic"}, + "custom": { + "axisCenteredZero": False, + "axisColorMode": "text", + "axisPlacement": "auto", + "drawStyle": "line", + "fillOpacity": 10, + "lineInterpolation": "linear", + "lineWidth": 1 + }, + "mappings": [], + "unit": "bytes" + } + }, + "gridPos": {"h": 8, "w": 12, "x": 0, "y": 8}, + "id": 3, + "options": { + "legend": {"displayMode": "list", "placement": "bottom", "showLegend": True}, + "tooltip": {"mode": "multi"} + }, + "targets": [ + { + "datasource": {"type": "prometheus", "uid": "prometheus"}, + "expr": "gpu_memory_used_bytes", + "legendFormat": "{{instance}} - GPU {{gpu_index}} Used", + "refId": "A" + }, + { + "datasource": {"type": "prometheus", "uid": "prometheus"}, + "expr": "gpu_memory_total_bytes", + "legendFormat": "{{instance}} - GPU {{gpu_index}} Total", + "refId": "B" + } + ], + "title": "GPU Memory Usage", + "type": "timeseries" + }, + { + "datasource": {"type": "prometheus", "uid": "prometheus"}, + "fieldConfig": { + "defaults": { + "color": {"mode": "palette-classic"}, + "custom": { + "axisCenteredZero": False, + "axisColorMode": "text", + "axisPlacement": "auto", + "drawStyle": "line", + "fillOpacity": 10, + "lineInterpolation": "linear", + "lineWidth": 1 + }, + "mappings": [], + "unit": "watt" + } + }, + "gridPos": {"h": 8, "w": 12, "x": 12, "y": 8}, + "id": 4, + "options": { + "legend": {"displayMode": "list", "placement": "bottom", "showLegend": True}, + "tooltip": {"mode": "multi"} + }, + "targets": [ + { + "datasource": {"type": "prometheus", "uid": "prometheus"}, + "expr": "gpu_power_watts", + "legendFormat": "{{instance}} - GPU {{gpu_index}}", + "refId": "A" + } 
+ ], + "title": "GPU Power Consumption", + "type": "timeseries" + } + ], + "refresh": "5s", + "schemaVersion": 38, + "style": "dark", + "tags": ["gpu", "amd", "rocm"], + "templating": {"list": []}, + "time": {"from": "now-15m", "to": "now"}, + "timepicker": {}, + "timezone": "", + "title": "AMD GPU Metrics Dashboard", + "uid": "amd-gpu-metrics", + "version": 1 + } + + with open(output_file, 'w') as f: + json.dump(dashboard, f, indent=2) + + log.info(f"GPU dashboard created: {output_file}") + return output_file diff --git a/lib/prometheus_config_lib.py b/lib/prometheus_config_lib.py new file mode 100644 index 00000000..da6934f6 --- /dev/null +++ b/lib/prometheus_config_lib.py @@ -0,0 +1,107 @@ +''' +Copyright 2025 Advanced Micro Devices, Inc. +Prometheus Configuration Generator for CVS Monitoring +''' + +import json +import yaml +import logging + +log = logging.getLogger(__name__) + + +def generate_prometheus_config(cluster_dict, config_dict, output_file=None): + """ + Generate Prometheus configuration with dynamic scrape targets. + + Args: + cluster_dict: Cluster configuration + config_dict: Monitoring configuration + output_file: Optional output file path + + Returns: + str: YAML configuration content + """ + from utils_lib import generate_prometheus_targets + + # Get configuration values + scrape_interval = config_dict.get('scrape_interval', '15s') + scrape_timeout = config_dict.get('scrape_timeout', '10s') + retention_days = config_dict.get('retention_days', 30) + exporter_port = config_dict.get('device_metrics_exporter_port', 5000) + + # Generate targets for all nodes (management + workers) + targets = generate_prometheus_targets(cluster_dict, exporter_port) + + log.info(f"Generating Prometheus config for {len(targets)} targets") + for target in targets: + log.info(f" • {target}") + + # Build Prometheus configuration + config = { + 'global': { + 'scrape_interval': scrape_interval, + 'scrape_timeout': scrape_timeout, + 'evaluation_interval': scrape_interval + }, + 'scrape_configs': [ + { + 'job_name': 'device-metrics-exporter', + 'static_configs': [ + { + 'targets': targets + } + ], + 'metric_relabel_configs': [ + { + 'source_labels': ['__name__'], + 'regex': 'gpu_.*', + 'action': 'keep' + } + ] + } + ] + } + + # Convert to YAML + yaml_content = yaml.dump(config, default_flow_style=False, sort_keys=False) + + # Write to file if specified + if output_file: + with open(output_file, 'w') as f: + f.write(yaml_content) + log.info(f"Prometheus config written to: {output_file}") + + return yaml_content + + +def update_prometheus_targets(prometheus_yml_path, cluster_dict, exporter_port=5000): + """ + Update existing Prometheus config with new targets. 
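+
+    Example (the config path is a placeholder; assumes cluster_dict is already loaded):
+        update_prometheus_targets('/etc/prometheus/prometheus.yml', cluster_dict, exporter_port=5000)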
+ + Args: + prometheus_yml_path: Path to prometheus.yml + cluster_dict: Cluster configuration + exporter_port: Exporter port (default: 5000) + """ + from utils_lib import generate_prometheus_targets + + # Load existing config + with open(prometheus_yml_path, 'r') as f: + config = yaml.safe_load(f) + + # Generate new targets + targets = generate_prometheus_targets(cluster_dict, exporter_port) + + # Update targets in scrape config + for scrape_config in config.get('scrape_configs', []): + if scrape_config.get('job_name') == 'device-metrics-exporter': + scrape_config['static_configs'] = [{'targets': targets}] + log.info(f"Updated scrape targets: {targets}") + break + + # Write back + with open(prometheus_yml_path, 'w') as f: + yaml.dump(config, f, default_flow_style=False, sort_keys=False) + + log.info(f"Prometheus config updated: {prometheus_yml_path}") diff --git a/lib/utils_lib.py b/lib/utils_lib.py index bc8f0387..9c288931 100644 --- a/lib/utils_lib.py +++ b/lib/utils_lib.py @@ -420,6 +420,80 @@ def resolve_test_config_placeholders(config_dict, cluster_dict): return resolved_config +def resolve_placeholder_with_fallback(value, fallback): + """ + Resolve placeholder strings, returning fallback if unresolved. + + Args: + value: Value that may contain unresolved placeholders like {prometheus-host} + fallback: Default value to use if placeholder is unresolved + + Returns: + Resolved value or fallback if value is None/empty/unresolved placeholder + + Examples: + >>> resolve_placeholder_with_fallback("{prometheus-host}", "localhost") + 'localhost' + >>> resolve_placeholder_with_fallback("10.0.0.5", "localhost") + '10.0.0.5' + >>> resolve_placeholder_with_fallback(None, "localhost") + 'localhost' + """ + if value is None: + return fallback + + # Convert to string + value_str = str(value).strip() + + # Empty or unresolved placeholder (starts with { and ends with }) + if not value_str or (value_str.startswith("{") and value_str.endswith("}")): + return fallback + + return value_str + + +def apply_monitoring_defaults(config_dict): + """ + Apply default fallback values for monitoring configuration. + Ensures localhost/default ports/versions when placeholders aren't resolved. + + Args: + config_dict: Monitoring configuration dictionary + + Returns: + dict: Configuration with defaults applied + """ + defaults = { + 'prometheus_host': 'localhost', + 'prometheus_port': 9090, + 'prometheus_version': 'v2.55.0', + 'grafana_host': 'localhost', + 'grafana_port': 3000, + 'grafana_version': '10.4.1', + 'device_metrics_exporter_version': 'v1.4.0', + 'device_metrics_exporter_port': 5000, + 'device_metrics_exporter_host': 'localhost', + } + + result = config_dict.copy() + + for key, default_value in defaults.items(): + current_value = result.get(key) + result[key] = resolve_placeholder_with_fallback(current_value, default_value) + + # Build derived URLs with resolved values + if 'prometheus_url' in result: + prom_host = result['prometheus_host'] + prom_port = result['prometheus_port'] + result['prometheus_url'] = f"http://{prom_host}:{prom_port}" + + if 'grafana_url' in result: + graf_host = result['grafana_host'] + graf_port = result['grafana_port'] + result['grafana_url'] = f"http://{graf_host}:{graf_port}" + + return result + def collect_system_metadata(phdl, cluster_dict, config_dict, test_command=None, env_vars=None): """ Collect comprehensive system metadata from compute nodes for test reporting. 
@@ -687,3 +761,138 @@ def collect_system_metadata(phdl, cluster_dict, config_dict, test_command=None, log.info(f'Collected metadata: {list(metadata.keys())}') return metadata + + +def get_management_node(cluster_dict): + """ + Get the management/head node from cluster configuration. + + Args: + cluster_dict: Cluster configuration dictionary + + Returns: + str: Management node IP/hostname + + Example: + >>> cluster = {'head_node_dict': {'mgmt_ip': '10.0.0.100'}} + >>> get_management_node(cluster) + '10.0.0.100' + """ + return cluster_dict.get('head_node_dict', {}).get('mgmt_ip', 'localhost') + + +def get_all_nodes(cluster_dict): + """ + Get all nodes (workers + management) from cluster configuration. + + Args: + cluster_dict: Cluster configuration dictionary + + Returns: + list: All node IPs/hostnames including management node + + Example: + >>> cluster = { + ... 'head_node_dict': {'mgmt_ip': '10.0.0.100'}, + ... 'node_dict': {'10.0.0.101': {...}, '10.0.0.102': {...}} + ... } + >>> get_all_nodes(cluster) + ['10.0.0.100', '10.0.0.101', '10.0.0.102'] + """ + mgmt_node = get_management_node(cluster_dict) + worker_nodes = list(cluster_dict.get('node_dict', {}).keys()) + + # Management node + all workers + all_nodes = [mgmt_node] + worker_nodes + + # Remove duplicates (in case mgmt is also in node_dict) + return list(dict.fromkeys(all_nodes)) + + +def get_worker_nodes(cluster_dict): + """ + Get worker nodes only (excluding management node). + + Args: + cluster_dict: Cluster configuration dictionary + + Returns: + list: Worker node IPs/hostnames + """ + return list(cluster_dict.get('node_dict', {}).keys()) + + +def is_management_node(node, cluster_dict): + """ + Check if a node is the management/head node. + + Args: + node: Node IP/hostname to check + cluster_dict: Cluster configuration dictionary + + Returns: + bool: True if node is management node + + Example: + >>> cluster = {'head_node_dict': {'mgmt_ip': 'localhost'}} + >>> is_management_node('localhost', cluster) + True + >>> is_management_node('10.0.0.101', cluster) + False + """ + mgmt_node = get_management_node(cluster_dict) + + # Handle localhost aliases + if mgmt_node in ['localhost', '127.0.0.1'] and node in ['localhost', '127.0.0.1']: + return True + + return node == mgmt_node + + +def is_single_node_deployment(cluster_dict): + """ + Detect if this is a single-node (localhost) deployment. + + Args: + cluster_dict: Cluster configuration dictionary + + Returns: + bool: True if single-node deployment + + Example: + >>> cluster = {'head_node_dict': {'mgmt_ip': 'localhost'}, 'node_dict': {'localhost': {}}} + >>> is_single_node_deployment(cluster) + True + """ + all_nodes = get_all_nodes(cluster_dict) + + # Single node if only one unique node + if len(set(all_nodes)) == 1: + return True + + # Also single node if all nodes are localhost variants + localhost_variants = {'localhost', '127.0.0.1', '::1'} + return all(node in localhost_variants for node in all_nodes) + + +def generate_prometheus_targets(cluster_dict, exporter_port=5000): + """ + Generate Prometheus scrape targets for all nodes. + + Args: + cluster_dict: Cluster configuration dictionary + exporter_port: Port where Device Metrics Exporter runs (default: 5000) + + Returns: + list: Prometheus target strings in format "host:port" + + Example: + >>> cluster = { + ... 'head_node_dict': {'mgmt_ip': '10.0.0.100'}, + ... 'node_dict': {'10.0.0.101': {}, '10.0.0.102': {}} + ... 
} + >>> generate_prometheus_targets(cluster) + ['10.0.0.100:5000', '10.0.0.101:5000', '10.0.0.102:5000'] + """ + all_nodes = get_all_nodes(cluster_dict) + return [f"{node}:{exporter_port}" for node in all_nodes] diff --git a/monitoring/dashboards/gpu-metrics-dashboard.json b/monitoring/dashboards/gpu-metrics-dashboard.json new file mode 100644 index 00000000..ee6048df --- /dev/null +++ b/monitoring/dashboards/gpu-metrics-dashboard.json @@ -0,0 +1,1006 @@ +{ + "annotations": { + "list": [] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "id": null, + "links": [], + "panels": [ + { + "type": "row", + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 100, + "title": "\ud83d\udd34 Critical Health Metrics" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "title": "GPU Edge Temperature", + "type": "stat", + "gridPos": { + "h": 4, + "w": 4, + "x": 0, + "y": 1 + }, + "id": 1, + "fieldConfig": { + "defaults": { + "unit": "celsius", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 70 + }, + { + "color": "orange", + "value": 85 + }, + { + "color": "red", + "value": 95 + } + ] + } + } + }, + "options": { + "colorMode": "background", + "graphMode": "area", + "orientation": "auto" + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "gpu_edge_temperature", + "legendFormat": "{{hostname}} GPU{{gpu_id}}", + "refId": "A" + } + ] + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "title": "GPU Utilization", + "type": "stat", + "gridPos": { + "h": 4, + "w": 4, + "x": 4, + "y": 1 + }, + "id": 2, + "fieldConfig": { + "defaults": { + "unit": "percent", + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "yellow", + "value": 10 + }, + { + "color": "green", + "value": 50 + } + ] + } + } + }, + "options": { + "colorMode": "background", + "graphMode": "area", + "orientation": "auto" + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "gpu_gfx_activity", + "legendFormat": "{{hostname}} GPU{{gpu_id}}", + "refId": "A" + } + ] + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "title": "Power Usage", + "type": "stat", + "gridPos": { + "h": 4, + "w": 4, + "x": 8, + "y": 1 + }, + "id": 3, + "fieldConfig": { + "defaults": { + "unit": "watt", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 500 + }, + { + "color": "red", + "value": 700 + } + ] + } + } + }, + "options": { + "colorMode": "value", + "graphMode": "area" + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "gpu_power_usage", + "legendFormat": "{{hostname}} GPU{{gpu_id}}", + "refId": "A" + } + ] + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "title": "Total GPUs Online", + "type": "stat", + "gridPos": { + "h": 4, + "w": 4, + "x": 12, + "y": 1 + }, + "id": 4, + "fieldConfig": { + "defaults": { + "unit": "short", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + } + } + }, + "options": { + "colorMode": "value", + "graphMode": "none", + "textMode": "value_and_name" + }, + "targets": [ + { + 
"datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "count(gpu_edge_temperature)", + "legendFormat": "Total GPUs", + "refId": "A" + } + ] + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "title": "Memory Usage", + "type": "stat", + "gridPos": { + "h": 4, + "w": 4, + "x": 16, + "y": 1 + }, + "id": 5, + "fieldConfig": { + "defaults": { + "unit": "percent", + "max": 100, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 70 + }, + { + "color": "red", + "value": 90 + } + ] + } + } + }, + "options": { + "colorMode": "background", + "graphMode": "area" + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "(gpu_used_vram / gpu_total_vram) * 100", + "legendFormat": "{{hostname}} GPU{{gpu_id}}", + "refId": "A" + } + ] + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "title": "ECC Errors (5m rate)", + "type": "stat", + "gridPos": { + "h": 4, + "w": 4, + "x": 20, + "y": 1 + }, + "id": 6, + "fieldConfig": { + "defaults": { + "unit": "short", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 10 + }, + { + "color": "red", + "value": 100 + } + ] + } + } + }, + "options": { + "colorMode": "background", + "graphMode": "area" + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "rate(gpu_ecc_correct_total[5m]) * 300", + "legendFormat": "{{hostname}} GPU{{gpu_id}}", + "refId": "A" + } + ] + }, + { + "type": "row", + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 5 + }, + "id": 101, + "title": "\ud83c\udf21\ufe0f Temperature & Utilization" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "title": "GPU Temperatures Over Time", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 6 + }, + "id": 7, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisPlacement": "auto", + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 2, + "pointSize": 5, + "showPoints": "never" + }, + "unit": "celsius", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 70 + }, + { + "color": "red", + "value": 85 + } + ] + } + } + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "right", + "showLegend": true, + "calcs": [ + "mean", + "max", + "last" + ] + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "gpu_edge_temperature", + "legendFormat": "{{hostname}} GPU{{gpu_id}} Edge", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "gpu_junction_temperature", + "legendFormat": "{{hostname}} GPU{{gpu_id}} Junction", + "refId": "B" + } + ] + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "title": "GPU Utilization Over Time", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 6 + }, + "id": 8, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisPlacement": "auto", + "drawStyle": "line", + "fillOpacity": 20, + "lineWidth": 2 + }, + "unit": "percent", + "max": 100, + "min": 0 + } + }, + 
"options": { + "legend": { + "displayMode": "table", + "placement": "right", + "showLegend": true, + "calcs": [ + "mean", + "max", + "last" + ] + }, + "tooltip": { + "mode": "multi" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "gpu_gfx_activity", + "legendFormat": "{{hostname}} GPU{{gpu_id}}", + "refId": "A" + } + ] + }, + { + "type": "row", + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 14 + }, + "id": 102, + "title": "\ud83d\udcbe Memory & Power" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "title": "GPU Memory Usage (GB)", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 15 + }, + "id": 9, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisPlacement": "auto", + "drawStyle": "line", + "fillOpacity": 30, + "lineWidth": 2 + }, + "unit": "decbytes" + } + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "calcs": [ + "mean", + "max", + "last" + ] + }, + "tooltip": { + "mode": "multi" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "gpu_used_vram", + "legendFormat": "{{hostname}} GPU{{gpu_id}} Used", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "gpu_total_vram", + "legendFormat": "{{hostname}} GPU{{gpu_id}} Total", + "refId": "B" + } + ] + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "title": "Power Consumption by GPU", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 15 + }, + "id": 10, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisPlacement": "auto", + "drawStyle": "line", + "fillOpacity": 20, + "lineWidth": 2 + }, + "unit": "watt" + } + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "calcs": [ + "mean", + "max", + "last" + ] + }, + "tooltip": { + "mode": "multi" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "gpu_power_usage", + "legendFormat": "{{hostname}} GPU{{gpu_id}}", + "refId": "A" + } + ] + }, + { + "type": "row", + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 23 + }, + "id": 103, + "title": "\ud83d\udd27 Advanced Metrics" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "title": "GPU Clock Speed (MHz)", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 24 + }, + "id": 11, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisPlacement": "auto", + "drawStyle": "line", + "lineWidth": 2 + }, + "unit": "hertz" + } + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "gpu_clock{clock_type=\"GPU_CLOCK_TYPE_SYSTEM\"}", + "legendFormat": "{{hostname}} GPU{{gpu_id}}", + "refId": "A" + } + ] + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "title": "Memory Activity (%)", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 24 + }, + "id": 12, + "fieldConfig": { + "defaults": { + "color": { + "mode": 
"palette-classic" + }, + "custom": { + "axisPlacement": "auto", + "drawStyle": "line", + "fillOpacity": 15, + "lineWidth": 2 + }, + "unit": "percent", + "max": 100, + "min": 0 + } + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "gpu_umc_activity", + "legendFormat": "{{hostname}} GPU{{gpu_id}}", + "refId": "A" + } + ] + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "title": "ECC Errors by Component (5m rate)", + "type": "bargauge", + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 24 + }, + "id": 13, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 5 + }, + { + "color": "red", + "value": 50 + } + ] + }, + "unit": "short" + } + }, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "showUnfilled": true + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "sum(rate(gpu_ecc_correct_umc[5m]) * 300)", + "legendFormat": "UMC", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "sum(rate(gpu_ecc_correct_gfx[5m]) * 300)", + "legendFormat": "GFX", + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "sum(rate(gpu_ecc_correct_sdma[5m]) * 300)", + "legendFormat": "SDMA", + "refId": "C" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "sum(rate(gpu_ecc_correct_mmhub[5m]) * 300)", + "legendFormat": "MMHUB", + "refId": "D" + } + ] + }, + { + "type": "row", + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 32 + }, + "id": 104, + "title": "\ud83d\udda5\ufe0f Node Comparison" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "title": "Average GPU Temp by Node", + "type": "bargauge", + "gridPos": { + "h": 6, + "w": 8, + "x": 0, + "y": 33 + }, + "id": 14, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 70 + }, + { + "color": "red", + "value": 85 + } + ] + }, + "unit": "celsius" + } + }, + "options": { + "displayMode": "lcd", + "orientation": "horizontal" + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "avg by (hostname) (gpu_edge_temperature)", + "legendFormat": "{{hostname}}", + "refId": "A" + } + ] + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "title": "Total Power by Node", + "type": "piechart", + "gridPos": { + "h": 6, + "w": 8, + "x": 8, + "y": 33 + }, + "id": 15, + "options": { + "legend": { + "displayMode": "table", + "placement": "right", + "showLegend": true, + "values": [ + "value", + "percent" + ] + }, + "pieType": "pie", + "tooltip": { + "mode": "single" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "sum by (hostname) (gpu_power_usage)", + "legendFormat": "{{hostname}}", + "refId": "A" + } + ] + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "title": "GPU Count by Node", + "type": "stat", + "gridPos": { + 
"h": 6, + "w": 8, + "x": 16, + "y": 33 + }, + "id": 16, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "unit": "short" + } + }, + "options": { + "colorMode": "value", + "graphMode": "none", + "textMode": "value_and_name", + "orientation": "horizontal" + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "count by (hostname) (gpu_edge_temperature)", + "legendFormat": "{{hostname}}", + "refId": "A" + } + ] + } + ], + "refresh": "5s", + "schemaVersion": 39, + "tags": [ + "gpu", + "amd", + "rocm", + "cluster" + ], + "templating": { + "list": [] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h" + ] + }, + "timezone": "browser", + "title": "AMD GPU Cluster Monitoring", + "uid": "amd-gpu-metrics", + "version": 1 +} \ No newline at end of file diff --git a/monitoring/prometheus/alert_rules.yml b/monitoring/prometheus/alert_rules.yml new file mode 100644 index 00000000..723424d6 --- /dev/null +++ b/monitoring/prometheus/alert_rules.yml @@ -0,0 +1,74 @@ +# Prometheus Alert Rules for AMD GPU Health Monitoring + +groups: + - name: gpu_health_alerts + interval: 30s + rules: + # GPU Temperature Alerts + - alert: GPUTemperatureWarning + expr: amdgpu_temperature_celsius{sensor="edge"} > 95 + for: 2m + labels: + severity: warning + annotations: + summary: "GPU temperature high on {{ $labels.node }}" + description: "GPU {{ $labels.gpu }} on node {{ $labels.node }} has temperature {{ $value }}°C (threshold: 95°C)" + + - alert: GPUTemperatureCritical + expr: amdgpu_temperature_celsius{sensor="edge"} > 105 + for: 1m + labels: + severity: critical + annotations: + summary: "GPU temperature critical on {{ $labels.node }}" + description: "GPU {{ $labels.gpu }} on node {{ $labels.node }} has temperature {{ $value }}°C (threshold: 105°C)" + + # GPU Power Alerts + - alert: GPUPowerHigh + expr: amdgpu_power_watts > 700 + for: 5m + labels: + severity: warning + annotations: + summary: "GPU power consumption high on {{ $labels.node }}" + description: "GPU {{ $labels.gpu }} on node {{ $labels.node }} consuming {{ $value }}W (threshold: 700W)" + + # ECC Error Alerts + - alert: GPUECCErrors + expr: rate(amdgpu_ecc_errors_total[5m]) > 0 + for: 1m + labels: + severity: warning + annotations: + summary: "GPU ECC errors detected on {{ $labels.node }}" + description: "GPU {{ $labels.gpu }} on node {{ $labels.node }} reporting ECC errors" + + # PCIe Replay Errors + - alert: PCIeReplayErrors + expr: rate(amdgpu_pcie_replay_count[5m]) > 10 + for: 2m + labels: + severity: warning + annotations: + summary: "PCIe replay errors on {{ $labels.node }}" + description: "GPU {{ $labels.gpu }} on node {{ $labels.node }} experiencing PCIe replay errors" + + # Exporter Health + - alert: DeviceMetricsExporterDown + expr: up{job="device-metrics-exporter"} == 0 + for: 2m + labels: + severity: critical + annotations: + summary: "Device Metrics Exporter down on {{ $labels.node }}" + description: "Cannot scrape metrics from {{ $labels.node }} - exporter may be down" + + # Cluster-wide alerts + - alert: MultipleGPUsOverheating + expr: count(amdgpu_temperature_celsius{sensor="edge"} > 95) > 3 + for: 5m + labels: + severity: critical + annotations: + summary: "Multiple GPUs overheating in cluster" + description: "{{ $value }} GPUs are above 95°C - possible cooling issue" diff --git 
a/monitoring/prometheus/prometheus.yml b/monitoring/prometheus/prometheus.yml new file mode 100644 index 00000000..abc87370 --- /dev/null +++ b/monitoring/prometheus/prometheus.yml @@ -0,0 +1,54 @@ +# Prometheus Configuration for CVS Device Metrics Monitoring +# This file configures Prometheus to scrape AMD GPU metrics from Device Metrics Exporter + +global: + scrape_interval: 15s # How often to scrape targets + evaluation_interval: 15s # How often to evaluate rules + scrape_timeout: 10s # Timeout for scraping + external_labels: + cluster: 'cvs-cluster' + monitor: 'gpu-monitoring' + +# Load alert rules +rule_files: + - 'alert_rules.yml' + +# Alertmanager configuration (optional) +# alerting: +# alertmanagers: +# - static_configs: +# - targets: +# - 'localhost:9093' + +# Scrape configurations +scrape_configs: + # Job for AMD Device Metrics Exporter running on all GPU nodes + - job_name: 'device-metrics-exporter' + static_configs: + - targets: + # ===== UPDATE THESE WITH YOUR ACTUAL NODE HOSTNAMES/IPs ===== + #- 'node1:5000' + #- 'node2:5000' + # Add more nodes as needed + # For local testing use: - 'localhost:5000' + - 'localhost:5000' + labels: + cluster: 'cvs-cluster' + + # Relabel to extract node name from target + relabel_configs: + - source_labels: [__address__] + regex: '([^:]+):.*' + target_label: node + replacement: '$1' + + # Metric relabeling (optional filtering) + metric_relabel_configs: + - source_labels: [__name__] + regex: 'gpu_.*' + action: keep + + # Prometheus self-monitoring + - job_name: 'prometheus' + static_configs: + - targets: ['localhost:9090'] diff --git a/monitoring/provisioning/dashboards/default.yml b/monitoring/provisioning/dashboards/default.yml new file mode 100644 index 00000000..0fac35e9 --- /dev/null +++ b/monitoring/provisioning/dashboards/default.yml @@ -0,0 +1,13 @@ +apiVersion: 1 + +providers: + - name: 'Default' + orgId: 1 + folder: '' + type: file + disableDeletion: false + updateIntervalSeconds: 10 + allowUiUpdates: true + options: + path: /var/lib/grafana/dashboards + foldersFromFilesStructure: false diff --git a/monitoring/provisioning/datasources/prometheus.yml b/monitoring/provisioning/datasources/prometheus.yml new file mode 100644 index 00000000..0534726e --- /dev/null +++ b/monitoring/provisioning/datasources/prometheus.yml @@ -0,0 +1,11 @@ +apiVersion: 1 + +datasources: + - name: prometheus + type: prometheus + access: proxy + url: http://localhost:9090 + isDefault: true + editable: false + jsonData: + timeInterval: "5s" diff --git a/tests/monitoring/cleanup_monitoring_stack.py b/tests/monitoring/cleanup_monitoring_stack.py new file mode 100644 index 00000000..0f2b13a5 --- /dev/null +++ b/tests/monitoring/cleanup_monitoring_stack.py @@ -0,0 +1,91 @@ +"""Cleanup test for GPU monitoring stack - removes all components from all nodes.""" + +import pytest +import logging +import subprocess +from lib.parallel_ssh_lib import Pssh + +logger = logging.getLogger(__name__) + +def is_localhost(ip_address): + """Check if IP address is localhost.""" + import socket + local_addresses = {'localhost', '127.0.0.1', '::1', '127.0.1.1'} + if ip_address in local_addresses: + return True + try: + result = subprocess.run(['hostname', '-I'], capture_output=True, text=True, timeout=5) + if result.returncode == 0: + local_addresses.update(result.stdout.strip().split()) + except: pass + return ip_address in local_addresses + +@pytest.mark.cleanup +def test_stop_exporters_on_all_nodes(cluster_dict, all_nodes): + """Stop and remove device-metrics-exporter containers from 
all nodes.""" + logger.info(f"Stopping device-metrics-exporters on all {len(all_nodes)} nodes") + username = cluster_dict['username'] + priv_key_file = cluster_dict.get('priv_key_file', f"/home/{username}/.ssh/id_rsa") + commands = ["docker stop device-metrics-exporter || true", "docker rm device-metrics-exporter || true"] + + for node_ip in all_nodes: + logger.info(f"Cleaning up exporter on node: {node_ip}") + if is_localhost(node_ip): + for cmd in commands: + subprocess.run(cmd, shell=True, capture_output=True, text=True) + else: + phdl = Pssh([node_ip], user=username, priv_key=priv_key_file) + for cmd in commands: + phdl.run(cmd) + logger.info("✓ Exporters cleaned up on all nodes") + +@pytest.mark.cleanup +def test_stop_prometheus_on_management(cluster_dict, management_node): + """Stop Prometheus systemd service.""" + logger.info(f"Stopping Prometheus on management node: {management_node}") + username = cluster_dict['username'] + commands = ["sudo systemctl stop prometheus || true", "sudo systemctl disable prometheus || true"] + + if is_localhost(management_node): + for cmd in commands: + subprocess.run(cmd, shell=True, capture_output=True, text=True) + else: + phdl = Pssh([management_node], user=username, priv_key=cluster_dict.get('priv_key_file')) + for cmd in commands: + phdl.run(cmd) + logger.info("✓ Prometheus stopped") + +@pytest.mark.cleanup +def test_stop_grafana_on_management(cluster_dict, management_node): + """Stop and remove Grafana container.""" + logger.info(f"Stopping Grafana on management node: {management_node}") + commands = ["docker stop grafana || true", "docker rm grafana || true"] + + if is_localhost(management_node): + for cmd in commands: + subprocess.run(cmd, shell=True, capture_output=True, text=True) + logger.info("✓ Grafana stopped") + +@pytest.mark.cleanup +def test_remove_prometheus_config(cluster_dict, management_node): + """Remove Prometheus configuration and data.""" + logger.info(f"Removing Prometheus config from management node") + commands = [ + "sudo rm -f /etc/systemd/system/prometheus.service", + "sudo systemctl daemon-reload", + "sudo rm -rf /etc/prometheus", + "sudo rm -rf /var/lib/prometheus" + ] + + if is_localhost(management_node): + for cmd in commands: + subprocess.run(cmd, shell=True, capture_output=True, text=True) + logger.info("✓ Prometheus config removed") + +@pytest.mark.cleanup +def test_cleanup_summary(all_nodes, management_node): + """Display cleanup summary.""" + logger.info("=" * 60) + logger.info("MONITORING STACK CLEANUP COMPLETE") + logger.info(f"Cleaned {len(all_nodes)} nodes") + logger.info("=" * 60) diff --git a/tests/monitoring/install_device_metrics_exporter.py b/tests/monitoring/install_device_metrics_exporter.py new file mode 100644 index 00000000..1da39acd --- /dev/null +++ b/tests/monitoring/install_device_metrics_exporter.py @@ -0,0 +1,938 @@ +# Drop-in replacement for tests/monitoring/install_device_metrics_exporter.py +# Key changes: +# 1. Added apply_monitoring_defaults to config_dict fixture +# 2. Updated metrics_host fixture to use resolved device_metrics_exporter_host +# 3. 
Fixed hardcoded localhost in test_check_gpu_metrics_exposed (line ~217) + +import pytest +import re +import sys +import os +import time +import json +import logging + +sys.path.insert(0, './lib') +from parallel_ssh_lib import * +from utils_lib import * + +import globals + +log = globals.log + + +@pytest.fixture(scope="module") +def cluster_file(pytestconfig): + """Get cluster file path from pytest CLI""" + return pytestconfig.getoption("cluster_file") + + +@pytest.fixture(scope="module") +def config_file(pytestconfig): + """Get config file path from pytest CLI""" + return pytestconfig.getoption("config_file") + + +@pytest.fixture(scope="module") +def cluster_dict(cluster_file): + """Load cluster configuration""" + with open(cluster_file) as json_file: + cluster_dict = json.load(json_file) + cluster_dict = resolve_cluster_config_placeholders(cluster_dict) + log.info(cluster_dict) + return cluster_dict + + +@pytest.fixture(scope="module") +def config_dict(config_file, cluster_dict): + """Load monitoring configuration with localhost/version fallbacks""" + with open(config_file) as json_file: + config_dict_t = json.load(json_file) + config_dict = config_dict_t.get('monitoring', {}) + config_dict = resolve_test_config_placeholders(config_dict, cluster_dict) + # Apply defaults for unresolved placeholders + config_dict = apply_monitoring_defaults(config_dict) + log.info("Resolved monitoring config:") + log.info(config_dict) + return config_dict + + +@pytest.fixture(scope="module") +def metrics_host(config_dict): + """Get metrics host with fallback to localhost""" + return config_dict.get("device_metrics_exporter_host", "localhost") + + +@pytest.fixture(scope="module") +def phdl(cluster_dict): + """Create parallel SSH handle for all nodes""" + node_list = list(cluster_dict['node_dict'].keys()) + phdl = Pssh(log, node_list, user=cluster_dict['username'], pkey=cluster_dict['priv_key_file']) + return phdl + + +def test_check_docker_installed(phdl): + """Verify Docker is installed on all nodes""" + globals.error_list = [] + log.info("Checking if Docker is installed on all nodes") + + out_dict = phdl.exec('docker --version') + + for node in out_dict.keys(): + if not re.search(r'Docker version', out_dict[node], re.I): + fail_test(f"Docker is not installed on node {node}. Please install Docker first.") + + update_test_result() + + +def test_check_rocm_installed(phdl): + """Verify ROCm is installed on all nodes""" + globals.error_list = [] + log.info("Checking if ROCm is installed on all nodes") + + out_dict = phdl.exec('rocm-smi --version || amd-smi version') + + for node in out_dict.keys(): + if not re.search(r'ROCm|AMD', out_dict[node], re.I): + fail_test(f"ROCm is not installed on node {node}. 
Please install ROCm first.") + + update_test_result() + + +def test_pull_device_metrics_exporter_image(phdl, config_dict): + """Pull Device Metrics Exporter Docker image on all nodes""" + globals.error_list = [] + log.info("Pulling Device Metrics Exporter Docker image on all nodes") + + version = config_dict['device_metrics_exporter_version'] + image = f"rocm/device-metrics-exporter:{version}" + log.info(f"Using image: {image}") + + out_dict = phdl.exec(f'docker pull {image}', timeout=300) + + for node in out_dict.keys(): + if 'Error' in out_dict[node] or 'failed' in out_dict[node].lower(): + fail_test(f"Failed to pull Docker image on node {node}: {out_dict[node]}") + + update_test_result() + + +def test_stop_existing_device_metrics_exporter(phdl): + """Stop and remove any existing Device Metrics Exporter containers""" + globals.error_list = [] + log.info("Stopping existing Device Metrics Exporter containers (if any)") + + phdl.exec('docker stop device-metrics-exporter 2>/dev/null || true') + phdl.exec('docker rm device-metrics-exporter 2>/dev/null || true') + + log.info("Cleaned up existing containers") + update_test_result() + + +def test_start_device_metrics_exporter(phdl, config_dict): + """Start Device Metrics Exporter container on all nodes""" + globals.error_list = [] + log.info("Starting Device Metrics Exporter on all nodes") + + version = config_dict['device_metrics_exporter_version'] + port = config_dict['device_metrics_exporter_port'] + + log.info(f"Starting exporter version {version} on port {port}") + + # Docker run command + docker_cmd = f'''docker run -d \ + --device=/dev/dri \ + --device=/dev/kfd \ + --network=host \ + -p {port}:{port} \ + --restart unless-stopped \ + --name device-metrics-exporter \ + rocm/device-metrics-exporter:{version}''' + + out_dict = phdl.exec(docker_cmd) + + for node in out_dict.keys(): + if 'Error' in out_dict[node]: + fail_test(f"Failed to start Device Metrics Exporter on node {node}: {out_dict[node]}") + + log.info("Device Metrics Exporter started on all nodes") + update_test_result() + + +def test_verify_exporter_running(phdl): + """Verify Device Metrics Exporter is running""" + globals.error_list = [] + log.info("Verifying Device Metrics Exporter is running on all nodes") + + # Wait for containers to start + time.sleep(10) + + out_dict = phdl.exec('docker ps --filter name=device-metrics-exporter --format "{{.Status}}"') + + for node in out_dict.keys(): + if 'Up' not in out_dict[node]: + fail_test(f"Device Metrics Exporter is not running on node {node}") + + update_test_result() + + +def test_verify_metrics_endpoint(phdl, config_dict, metrics_host): + """Verify metrics endpoint is accessible""" + globals.error_list = [] + log.info("Verifying metrics endpoint is accessible on all nodes") + + port = config_dict['device_metrics_exporter_port'] + log.info(f"Testing endpoint: http://{metrics_host}:{port}/metrics") + + # Retry logic for slow container startup + max_retries = 3 + out_dict = None + + for attempt in range(max_retries): + out_dict = phdl.exec(f'curl -s http://{metrics_host}:{port}/metrics | head -20') + + # Check if we got output + has_output = False + for node in out_dict.keys(): + if len(out_dict[node]) > 0: + has_output = True + break + + if has_output: + break + else: + log.info(f"Attempt {attempt+1}/{max_retries}: No output yet, waiting 5 seconds...") + time.sleep(5) + + # Final validation + for node in out_dict.keys(): + output = out_dict[node] + log.info(f"Checking output from {node}, length: {len(output)}") + + if output and 
'gpu_' in output.lower(): + log.info(f"Metrics endpoint verified on node {node}") + else: + log.error(f"Output sample: {output[:200]}") + fail_test(f"Metrics endpoint not accessible on node {node}") + + update_test_result() + + +def test_check_gpu_metrics_exposed(phdl, config_dict, metrics_host): + """Verify GPU metrics are being exposed""" + globals.error_list = [] + log.info("Checking if GPU metrics are being exposed") + + port = config_dict['device_metrics_exporter_port'] + + # Use metrics_host instead of hardcoded localhost + out_dict = phdl.exec(f'curl -s http://{metrics_host}:{port}/metrics | head -50') + + for node in out_dict.keys(): + output = out_dict[node] + log.info(f"Checking GPU metrics from {node}, length: {len(output)}") + + if output.strip() and 'gpu_' in output.lower(): + log.info(f"GPU metrics verified on node {node}") + # Show sample + lines = [line for line in output.split('\n') if 'gpu_' in line.lower()][:2] + for line in lines: + log.info(f" Sample: {line[:80]}") + else: + log.error(f"No GPU metrics found. Output: {output[:300]}") + fail_test(f"GPU metrics not found on node {node}") + + update_test_result() + + +def test_display_summary(phdl): + """Display installation summary""" + log.info("=" * 80) + log.info("Device Metrics Exporter Installation Complete!") + log.info("=" * 80) + log.info("") + log.info("Exporter Status:") + + out_dict = phdl.exec('docker ps --filter name=device-metrics-exporter --format "{{.Names}}: {{.Status}}"') + + for node in out_dict.keys(): + log.info(f" {node}: {out_dict[node]}") + + log.info("Completed metrics tests successfully.") + + +# ============================================================================ +# Node Role Detection Fixtures +# ============================================================================ + +@pytest.fixture(scope='module') +def management_node(cluster_dict): + """Get the management/head node from cluster.""" + from utils_lib import get_management_node + return get_management_node(cluster_dict) + + +@pytest.fixture(scope='module') +def all_nodes(cluster_dict): + """Get all nodes (management + workers) where exporter should run.""" + from utils_lib import get_all_nodes + return get_all_nodes(cluster_dict) + + +@pytest.fixture(scope='module') +def worker_nodes(cluster_dict): + """Get worker nodes only.""" + from utils_lib import get_worker_nodes + return get_worker_nodes(cluster_dict) + + +@pytest.fixture(scope='module') +def is_single_node(cluster_dict): + """Check if this is a single-node deployment.""" + from utils_lib import is_single_node_deployment + return is_single_node_deployment(cluster_dict) + + +@pytest.fixture(scope='module') +def prometheus_targets(cluster_dict, config_dict): + """Generate Prometheus scrape targets for all nodes.""" + from utils_lib import generate_prometheus_targets + exporter_port = config_dict.get('device_metrics_exporter_port', 5000) + return generate_prometheus_targets(cluster_dict, exporter_port) + + +def is_mgmt_node(node, cluster_dict): + """Helper function to check if node is management node.""" + from utils_lib import is_management_node + return is_management_node(node, cluster_dict) + + +# Tests with Management Node Awareness + +def test_deploy_prometheus_on_management_only(cluster_dict, management_node, is_single_node, config_dict, prometheus_targets): + """ + Deploy Prometheus ONLY on management node with all targets configured. + Uses pssh for multi-node, subprocess for localhost. 
+ """ + log.info("="*80) + log.info(f"Deploying Prometheus on management node: {management_node}") + log.info(f"Targets: {prometheus_targets}") + log.info("="*80) + + import subprocess + import os + from prometheus_config_lib import generate_prometheus_config + + # Generate Prometheus config + prometheus_yml = "/tmp/prometheus_cvs.yml" + generate_prometheus_config(cluster_dict, config_dict, prometheus_yml) + log.info(f" Config generated with {len(prometheus_targets)} targets") + + prom_version = config_dict.get('prometheus_version', 'v2.55.0').lstrip('v') + + # Deploy on localhost/management node + if is_single_node or is_localhost(management_node): + # LOCAL DEPLOYMENT + # Stop existing + subprocess.run("sudo systemctl stop prometheus 2>/dev/null || true", shell=True) + subprocess.run("sudo pkill -9 prometheus 2>/dev/null || true", shell=True) + + # Install if needed + if not os.path.exists('/opt/prometheus/prometheus'): + log.info(f"Installing Prometheus {prom_version}...") + cmd = f"""cd /tmp && wget -q https://github.com/prometheus/prometheus/releases/download/v{prom_version}/prometheus-{prom_version}.linux-amd64.tar.gz && tar xzf prometheus-{prom_version}.linux-amd64.tar.gz && sudo mkdir -p /opt/prometheus /var/lib/prometheus/data && sudo cp -r prometheus-{prom_version}.linux-amd64/* /opt/prometheus/""" + subprocess.run(cmd, shell=True, check=True) + + # Copy config + subprocess.run(f"sudo cp {prometheus_yml} /opt/prometheus/prometheus.yml", shell=True, check=True) + + # Create systemd service + svc = """[Unit] +Description=Prometheus +After=network.target + +[Service] +Type=simple +User=root +ExecStart=/opt/prometheus/prometheus --config.file=/opt/prometheus/prometheus.yml --storage.tsdb.path=/var/lib/prometheus/data --web.listen-address=0.0.0.0:9090 +Restart=always + +[Install] +WantedBy=multi-user.target +""" + with open('/tmp/prometheus.service', 'w') as f: + f.write(svc) + subprocess.run("sudo cp /tmp/prometheus.service /etc/systemd/system/", shell=True, check=True) + subprocess.run("sudo systemctl daemon-reload && sudo systemctl enable prometheus && sudo systemctl restart prometheus", shell=True, check=True) + + import time + time.sleep(3) + + # Verify + result = subprocess.run("systemctl is-active prometheus", shell=True, capture_output=True) + assert result.returncode == 0, "Prometheus not running" + log.info("SUCCESS: Prometheus running on management node (localhost)") + else: + # MULTI-NODE DEPLOYMENT via SSH to management node only + log.info(f"Deploying to remote management node: {management_node}") + from parallel_ssh_lib import Pssh + + # Create SSH client for management node ONLY + mgmt_dict = {management_node: cluster_dict['node_dict'].get(management_node, {'bmc_ip': 'NA', 'vpc_ip': management_node})} + phdl = Pssh(log, list(mgmt_dict.keys()), user=cluster_dict['username'], pkey=cluster_dict['priv_key_file']) + + # Upload config file to management node + import tempfile + with tempfile.NamedTemporaryFile(mode='w', suffix='.yml', delete=False) as f: + with open(prometheus_yml, 'r') as src: + f.write(src.read()) + temp_config = f.name + + # Deploy Prometheus on management node only + deploy_script = f""" + # Stop existing + sudo systemctl stop prometheus 2>/dev/null || true + sudo pkill -9 prometheus 2>/dev/null || true + + # Install if needed + if [ ! -f /opt/prometheus/prometheus ]; then + echo "Installing Prometheus {prom_version}..." 
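+ # NOTE: the download below assumes the management node has outbound HTTPS
+ # access to github.com; on air-gapped clusters, install Prometheus under
+ # /opt/prometheus beforehand so this branch is skipped.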
+ cd /tmp + wget -q https://github.com/prometheus/prometheus/releases/download/v{prom_version}/prometheus-{prom_version}.linux-amd64.tar.gz + tar xzf prometheus-{prom_version}.linux-amd64.tar.gz + sudo mkdir -p /opt/prometheus /var/lib/prometheus/data + sudo cp -r prometheus-{prom_version}.linux-amd64/* /opt/prometheus/ + fi + + # Copy config (uploaded separately via SCP) + sudo mkdir -p /opt/prometheus + + # Create systemd service + sudo tee /etc/systemd/system/prometheus.service > /dev/null << 'SVCEOF' +[Unit] +Description=Prometheus +After=network.target + +[Service] +Type=simple +User=root +ExecStart=/opt/prometheus/prometheus --config.file=/opt/prometheus/prometheus.yml --storage.tsdb.path=/var/lib/prometheus/data --web.listen-address=0.0.0.0:9090 +Restart=always + +[Install] +WantedBy=multi-user.target +SVCEOF + + sudo systemctl daemon-reload + sudo systemctl enable prometheus + sudo systemctl start prometheus + sleep 2 + systemctl is-active prometheus + """ + + # Execute deployment on management node only + result = phdl.exec(deploy_script) + + # Verify deployment succeeded + for node, output in result.items(): + if 'active' not in output: + fail_test(f"Prometheus deployment failed on {node}: {output}") + + log.info(f"SUCCESS: Prometheus deployed and running on management node: {management_node}") + log.info("SUCCESS: ENFORCEMENT: Prometheus deployed ONLY to management node, NOT to workers") + +def test_deploy_grafana_on_management_only(cluster_dict, management_node, is_single_node, config_dict): + """ + Deploy Grafana ONLY on management node. + Uses pssh for multi-node, subprocess for localhost. + """ + log.info(f"Deploying Grafana on management node: {management_node}") + + # Create provisioning configs and dashboard BEFORE starting Grafana + create_grafana_provisioning_configs() + create_grafana_dashboard_file() + + import subprocess + import os + + grafana_version = config_dict.get('grafana_version', '10.4.1') + grafana_port = config_dict.get('grafana_port', '3000') + + if is_single_node or is_localhost(management_node): + # LOCAL DEPLOYMENT + # Stop existing + subprocess.run("docker stop grafana 2>/dev/null || true", shell=True) + subprocess.run("docker rm grafana 2>/dev/null || true", shell=True) + + # Create data directory + grafana_data = "/home/svdt-8/manoj/cvs/cvs/monitoring/grafana_data" + os.makedirs(grafana_data, exist_ok=True) + subprocess.run(f"sudo chown -R 472:472 {grafana_data}", shell=True, check=True) + + # Start Grafana + cmd = f"""docker run -d \ + --name grafana \ + --network host \ + --restart unless-stopped \ + -v {grafana_data}:/var/lib/grafana \ + -v $(pwd)/monitoring/provisioning:/etc/grafana/provisioning \ + -v $(pwd)/monitoring/dashboards:/var/lib/grafana/dashboards \ + grafana/grafana:{grafana_version}""" + subprocess.run(cmd, shell=True, check=True) + + import time + time.sleep(3) + + # Verify + result = subprocess.run("docker ps | grep grafana", shell=True, capture_output=True) + assert result.returncode == 0, "Grafana not running" + log.info(f"SUCCESS: Grafana running on management node (localhost) port {grafana_port}") + else: + # MULTI-NODE DEPLOYMENT via SSH to management node only + log.info(f"Deploying to remote management node: {management_node}") + from parallel_ssh_lib import Pssh + + # Create SSH client for management node ONLY + mgmt_dict = {management_node: cluster_dict['node_dict'].get(management_node, {'bmc_ip': 'NA', 'vpc_ip': management_node})} + phdl = Pssh(log, list(mgmt_dict.keys()), user=cluster_dict['username'], 
pkey=cluster_dict['priv_key_file']) + + # Deploy Grafana on management node only + deploy_script = f""" + # Stop existing + docker stop grafana 2>/dev/null || true + docker rm grafana 2>/dev/null || true + + # Create data directory + mkdir -p /tmp/grafana_data + sudo chown -R 472:472 /tmp/grafana_data + + # Start Grafana + docker run -d \ + --name grafana \ + --network host \ + --restart unless-stopped \ + -v /tmp/grafana_data:/var/lib/grafana \ + grafana/grafana:{grafana_version} + + sleep 3 + docker ps | grep grafana + """ + + # Execute deployment on management node only + result = phdl.exec(deploy_script) + + # Verify deployment succeeded + for node, output in result.items(): + if 'grafana' not in output: + fail_test(f"Grafana deployment failed on {node}: {output}") + + log.info(f"SUCCESS: Grafana deployed and running on management node: {management_node}") + log.info("SUCCESS: ENFORCEMENT: Grafana deployed ONLY to management node, NOT to workers") + +def test_verify_all_nodes_for_exporter(all_nodes, management_node): + """ + Verify that exporter targets include all nodes (management + workers). + """ + log.info("="*80) + log.info(f"All nodes where exporter should run:") + for node in all_nodes: + is_mgmt = " (MANAGEMENT)" if node == management_node else "" + log.info(f" • {node}{is_mgmt}") + log.info("="*80) + + assert len(all_nodes) > 0 + assert management_node in all_nodes + log.info(f" Total nodes for exporter deployment: {len(all_nodes)}") + + +def test_prometheus_scrape_targets(prometheus_targets, all_nodes): + """ + Verify Prometheus scrape targets include all nodes. + """ + log.info("="*80) + log.info("Prometheus scrape targets:") + for target in prometheus_targets: + log.info(f" • {target}") + log.info("="*80) + + assert len(prometheus_targets) == len(all_nodes) + log.info(f" Scrape targets generated for all {len(all_nodes)} nodes") + + +def test_verify_service_distribution(cluster_dict, management_node, all_nodes, worker_nodes, is_single_node): + """ + CRITICAL TEST: Verify service distribution enforcement. + - Exporter must be on ALL nodes (management + workers) + - Prometheus must be ONLY on management node + - Grafana must be ONLY on management node + """ + log.info("="*80) + log.info("VERIFYING SERVICE DISTRIBUTION ENFORCEMENT") + log.info("="*80) + + # Show the architecture + log.info(f"\n Cluster Architecture:") + log.info(f" Management Node: {management_node}") + log.info(f" Worker Nodes: {worker_nodes if worker_nodes else 'None (single-node)'}") + log.info(f" Total Nodes: {len(all_nodes)}") + log.info(f" Deployment Type: {'Single-Node' if is_single_node else 'Multi-Node'}") + + log.info(f"\nSUCCESS: SERVICE DISTRIBUTION RULES:") + log.info(f" 1. Device Metrics Exporter → ALL {len(all_nodes)} nodes") + for node in all_nodes: + marker = "(MANAGEMENT)" if node == management_node else "(WORKER)" + log.info(f" {node} {marker}") + + log.info(f"\n 2. Prometheus → ONLY management node") + log.info(f" {management_node} (MANAGEMENT ONLY)") + if worker_nodes: + for node in worker_nodes: + log.info(f" {node} (NOT deployed)") + + log.info(f"\n 3. 
Grafana → ONLY management node") + log.info(f" {management_node} (MANAGEMENT ONLY)") + if worker_nodes: + for node in worker_nodes: + log.info(f" {node} (NOT deployed)") + + log.info(f"\n" + "="*80) + log.info("SUCCESS: SERVICE DISTRIBUTION VERIFIED") + log.info("="*80) + + # Assert the rules + assert len(all_nodes) >= 1, "Must have at least one node" + assert management_node in all_nodes, "Management node must be in all_nodes list" + + if not is_single_node: + assert len(worker_nodes) > 0, "Multi-node must have workers" + log.info(f"SUCCESS: ENFORCEMENT VERIFIED: Multi-node cluster with proper separation") + else: + log.info(f"SUCCESS: ENFORCEMENT VERIFIED: Single-node deployment (all services on localhost)") + + +def is_localhost(node): + """Check if a node IP/hostname refers to localhost.""" + import socket + import subprocess + + # Obvious localhost values + if node in ['localhost', '127.0.0.1', '::1', 'localhost.localdomain']: + return True + + # Get all local IPs + local_ips = set(['127.0.0.1', '::1', 'localhost']) + + try: + # Get hostname and its IP + hostname = socket.gethostname() + local_ips.add(hostname) + + # Get primary IP + try: + local_ip = socket.gethostbyname(hostname) + local_ips.add(local_ip) + except: + pass + + # Get all IPs from hostname -I + try: + result = subprocess.run(['hostname', '-I'], capture_output=True, text=True, timeout=2) + if result.returncode == 0: + for ip in result.stdout.strip().split(): + local_ips.add(ip.strip()) + except: + pass + + # Get all IPs from ip addr + try: + result = subprocess.run(['ip', 'addr'], capture_output=True, text=True, timeout=2) + if result.returncode == 0: + import re + for match in re.finditer(r'inet\s+(\d+\.\d+\.\d+\.\d+)', result.stdout): + local_ips.add(match.group(1)) + except: + pass + + except Exception as e: + log.warning(f"Error detecting local IPs: {e}") + + log.info(f"Local IPs detected: {local_ips}") + log.info(f"Checking if {node} is localhost: {node in local_ips}") + + return node in local_ips + + +def create_grafana_dashboard_file(): + """Create GPU dashboard with correct metric names.""" + import os + import json + + dashboard_dir = "monitoring/dashboards" + os.makedirs(dashboard_dir, exist_ok=True) + + dashboard = { + "annotations": {"list": []}, + "editable": True, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": None, + "links": [], + "panels": [ + { + "datasource": "prometheus", + "fieldConfig": { + "defaults": { + "color": {"mode": "palette-classic"}, + "custom": { + "axisCenteredZero": False, + "axisColorMode": "text", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "showPoints": "never" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + {"color": "green", "value": None}, + {"color": "yellow", "value": 70}, + {"color": "red", "value": 85} + ] + }, + "unit": "celsius" + } + }, + "gridPos": {"h": 8, "w": 8, "x": 0, "y": 0}, + "id": 1, + "options": { + "legend": {"displayMode": "list", "placement": "bottom", "showLegend": True}, + "tooltip": {"mode": "multi"} + }, + "targets": [ + { + "datasource": "prometheus", + "expr": "gpu_edge_temperature", + "legendFormat": "{{hostname}} GPU{{gpu_id}} Edge", + "refId": "A" + }, + { + "datasource": "prometheus", + "expr": "gpu_junction_temperature", + "legendFormat": "{{hostname}} GPU{{gpu_id}} Junction", + "refId": "B" + } + ], + "title": "GPU Temperature", + "type": "timeseries" + }, + { + 
"datasource": "prometheus", + "fieldConfig": { + "defaults": { + "color": {"mode": "palette-classic"}, + "custom": { + "axisCenteredZero": False, + "axisColorMode": "text", + "axisPlacement": "auto", + "drawStyle": "line", + "fillOpacity": 10, + "lineInterpolation": "linear", + "lineWidth": 1 + }, + "mappings": [], + "unit": "watt" + } + }, + "gridPos": {"h": 8, "w": 8, "x": 8, "y": 0}, + "id": 2, + "options": { + "legend": {"displayMode": "list", "placement": "bottom", "showLegend": True}, + "tooltip": {"mode": "multi"} + }, + "targets": [ + { + "datasource": "prometheus", + "expr": "gpu_power_usage", + "legendFormat": "{{hostname}} GPU{{gpu_id}}", + "refId": "A" + } + ], + "title": "GPU Power Usage", + "type": "timeseries" + }, + { + "datasource": "prometheus", + "fieldConfig": { + "defaults": { + "color": {"mode": "palette-classic"}, + "custom": { + "axisCenteredZero": False, + "axisColorMode": "text", + "axisPlacement": "auto", + "drawStyle": "line", + "fillOpacity": 10, + "lineInterpolation": "linear", + "lineWidth": 1 + }, + "mappings": [], + "unit": "watt" + } + }, + "gridPos": {"h": 8, "w": 8, "x": 16, "y": 0}, + "id": 3, + "options": { + "legend": {"displayMode": "list", "placement": "bottom", "showLegend": True}, + "tooltip": {"mode": "multi"} + }, + "targets": [ + { + "datasource": "prometheus", + "expr": "gpu_average_package_power", + "legendFormat": "{{hostname}} GPU{{gpu_id}}", + "refId": "A" + } + ], + "title": "GPU Average Package Power", + "type": "timeseries" + }, + { + "datasource": "prometheus", + "fieldConfig": { + "defaults": { + "color": {"mode": "palette-classic"}, + "custom": { + "axisCenteredZero": False, + "axisColorMode": "text", + "axisPlacement": "auto", + "drawStyle": "line", + "fillOpacity": 10, + "lineInterpolation": "linear", + "lineWidth": 1 + }, + "mappings": [], + "unit": "hertz" + } + }, + "gridPos": {"h": 8, "w": 12, "x": 0, "y": 8}, + "id": 4, + "options": { + "legend": {"displayMode": "list", "placement": "bottom", "showLegend": True}, + "tooltip": {"mode": "multi"} + }, + "targets": [ + { + "datasource": "prometheus", + "expr": "gpu_clock{clock_type=\"GPU_CLOCK_TYPE_SYSTEM\"}", + "legendFormat": "{{hostname}} GPU{{gpu_id}}", + "refId": "A" + } + ], + "title": "GPU Clock Speed", + "type": "timeseries" + }, + { + "datasource": "prometheus", + "fieldConfig": { + "defaults": { + "color": {"mode": "palette-classic"}, + "custom": { + "axisCenteredZero": False, + "axisColorMode": "text", + "axisPlacement": "auto", + "drawStyle": "line", + "fillOpacity": 10, + "lineInterpolation": "linear", + "lineWidth": 1 + }, + "mappings": [], + "unit": "celsius" + } + }, + "gridPos": {"h": 8, "w": 12, "x": 12, "y": 8}, + "id": 5, + "options": { + "legend": {"displayMode": "list", "placement": "bottom", "showLegend": True}, + "tooltip": {"mode": "multi"} + }, + "targets": [ + { + "datasource": "prometheus", + "expr": "gpu_memory_temperature", + "legendFormat": "{{hostname}} GPU{{gpu_id}}", + "refId": "A" + } + ], + "title": "GPU Memory Temperature", + "type": "timeseries" + } + ], + "refresh": "5s", + "schemaVersion": 39, + "tags": ["gpu", "amd", "rocm"], + "templating": {"list": []}, + "time": {"from": "now-15m", "to": "now"}, + "timepicker": {}, + "timezone": "browser", + "title": "AMD GPU Metrics Dashboard", + "uid": "amd-gpu-metrics", + "version": 1 + } + + dashboard_file = f"{dashboard_dir}/gpu-metrics-dashboard.json" + with open(dashboard_file, 'w') as f: + json.dump(dashboard, f, indent=2) + + log.info(f"✓ Created dashboard: {dashboard_file}") + return 
dashboard_file + + +def create_grafana_provisioning_configs(): + """Create Grafana provisioning configs for datasources and dashboards.""" + import os + + # Create directories + os.makedirs("monitoring/provisioning/datasources", exist_ok=True) + os.makedirs("monitoring/provisioning/dashboards", exist_ok=True) + + # Datasource config + datasource_config = """apiVersion: 1 + +datasources: + - name: prometheus + type: prometheus + access: proxy + url: http://localhost:9090 + isDefault: true + editable: false + jsonData: + timeInterval: "5s" +""" + + with open("monitoring/provisioning/datasources/prometheus.yml", 'w') as f: + f.write(datasource_config) + + # Dashboard provisioning config + dashboard_config = """apiVersion: 1 + +providers: + - name: 'Default' + orgId: 1 + folder: '' + type: file + disableDeletion: false + updateIntervalSeconds: 10 + allowUiUpdates: true + options: + path: /var/lib/grafana/dashboards +""" + + with open("monitoring/provisioning/dashboards/default.yml", 'w') as f: + f.write(dashboard_config) + + log.info("✓ Created Grafana provisioning configs") diff --git a/utils/deploy_monitoring_stack.sh b/utils/deploy_monitoring_stack.sh new file mode 100755 index 00000000..0ece3e94 --- /dev/null +++ b/utils/deploy_monitoring_stack.sh @@ -0,0 +1,238 @@ +#!/bin/bash + +# CVS Monitoring Stack Deployment Script +# Deploys Device Metrics Exporter + Prometheus + Grafana + +set -e + +# Get script directory and repo root +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)" + +cd "$REPO_ROOT" + +# Configuration with localhost fallback +CLUSTER_FILE="${1:-./input/cluster_file/local_test_cluster.json}" +MONITORING_CONFIG="${2:-./input/config_file/monitoring/monitoring_config.json}" + +# Helper function to resolve placeholders +resolve_with_fallback() { + local value="$1" + local fallback="$2" + + # If value is empty or contains unresolved placeholder pattern {...} + if [[ -z "$value" ]] || [[ "$value" =~ ^\{.*\}$ ]]; then + echo "$fallback" + else + # Remove 'v' prefix if exists for version numbers + echo "${value#v}" + fi +} + +# Read versions from config with fallback +if [ -f "$MONITORING_CONFIG" ] && command -v jq &> /dev/null; then + PROM_RAW=$(jq -r '.monitoring.prometheus_version // "v2.55.0"' "$MONITORING_CONFIG") + GRAF_RAW=$(jq -r '.monitoring.grafana_version // "10.4.1"' "$MONITORING_CONFIG") + EXPO_RAW=$(jq -r '.monitoring.device_metrics_exporter_version // "v1.4.0"' "$MONITORING_CONFIG") + + PROMETHEUS_VERSION=$(resolve_with_fallback "$PROM_RAW" "2.55.0") + GRAFANA_VERSION=$(resolve_with_fallback "$GRAF_RAW" "10.4.1") + DEVICE_METRICS_VERSION=$(resolve_with_fallback "$EXPO_RAW" "v1.4.0") +else + # Fallback defaults + PROMETHEUS_VERSION="2.55.0" + GRAFANA_VERSION="10.4.1" + DEVICE_METRICS_VERSION="v1.4.0" +fi + +echo "============================================" +echo "CVS Monitoring Stack Deployment" +echo "============================================" +echo "" +echo "Working Directory: $REPO_ROOT" +echo "Cluster File: $CLUSTER_FILE" +echo "Monitoring Config: $MONITORING_CONFIG" +echo "Prometheus Version: $PROMETHEUS_VERSION" +echo "Grafana Version: $GRAFANA_VERSION" +echo "Exporter Version: $DEVICE_METRICS_VERSION" +echo "" + +# Step 1: Deploy Device Metrics Exporter on all GPU nodes using pytest +echo "Step 1: Deploying Device Metrics Exporter on all GPU nodes..." 
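+# NOTE: the pytest invocation below writes its HTML report via the pytest-html
+# plugin (--html / --self-contained-html); the cluster and monitoring config
+# paths default to the values above and can be overridden as the script's
+# first and second positional arguments.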
+echo "------------------------------------------------------------" +pytest -vv -s ./tests/monitoring/install_device_metrics_exporter.py \ + --cluster_file "$CLUSTER_FILE" \ + --config_file "$MONITORING_CONFIG" \ + --html=/tmp/device_metrics_install_report.html \ + --capture=tee-sys \ + --self-contained-html + +if [ $? -ne 0 ]; then + echo "ERROR: Device Metrics Exporter installation failed!" + exit 1 +fi + +echo "" +echo "- Device Metrics Exporter deployed successfully!" +echo "" + +# Step 2: Setup Prometheus on management node +echo "Step 2: Setting up Prometheus..." +echo "------------------------------------------------------------" +# Stop existing Prometheus if running +if systemctl is-active --quiet prometheus 2>/dev/null; then + echo "Stopping existing Prometheus service..." + sudo systemctl stop prometheus + sleep 2 +fi + +sudo pkill -9 prometheus 2>/dev/null || true +sleep 2 + +if ! command -v prometheus &> /dev/null; then + echo "Prometheus not found. Installing..." + + cd /tmp + echo "Downloading Prometheus ${PROMETHEUS_VERSION} (~92MB)..." + wget --progress=bar:force https://github.com/prometheus/prometheus/releases/download/v${PROMETHEUS_VERSION}/prometheus-${PROMETHEUS_VERSION}.linux-amd64.tar.gz + echo "Download complete. Extracting..." + tar xzf prometheus-${PROMETHEUS_VERSION}.linux-amd64.tar.gz + + sudo mkdir -p /opt/prometheus + sudo cp -r prometheus-${PROMETHEUS_VERSION}.linux-amd64/* /opt/prometheus/ + sudo mkdir -p /var/lib/prometheus/data + + cd "$REPO_ROOT" + + # Copy config from repo + if [ -f "./monitoring/prometheus/prometheus.yml" ]; then + sudo cp ./monitoring/prometheus/prometheus.yml /opt/prometheus/ + echo "- Copied prometheus.yml" + else + echo "ERROR: prometheus.yml not found at ./monitoring/prometheus/prometheus.yml" + exit 1 + fi + + if [ -f "./monitoring/prometheus/alert_rules.yml" ]; then + sudo cp ./monitoring/prometheus/alert_rules.yml /opt/prometheus/ + echo "- Copied alert_rules.yml" + else + echo "WARNING: alert_rules.yml not found" + fi + + # Create systemd service + sudo tee /etc/systemd/system/prometheus.service > /dev/null </dev/null || true + docker rm grafana 2>/dev/null || true + + docker run -d \ + -p 3000:3000 \ + --name grafana \ + --restart unless-stopped \ + -v grafana-storage:/var/lib/grafana \ + grafana/grafana:${GRAFANA_VERSION} + + echo " Grafana installed and started" + echo " Default credentials: admin/admin" +else + echo " Grafana container already exists" + if ! docker ps --format '{{.Names}}' | grep -q '^grafana$'; then + echo " Starting Grafana..." + docker start grafana + fi +fi + +# Step 4: Verify everything is running +echo "" +echo "Step 4: Verifying installation..." 
+echo "------------------------------------------------------------" + +# Wait a bit for services to be ready +sleep 3 + +# Check Prometheus +if curl -s http://localhost:9090/-/healthy > /dev/null 2>&1; then + echo " Prometheus is healthy" +else + echo " Prometheus health check failed" +fi + +# Check Grafana +if curl -s http://localhost:3000/api/health > /dev/null 2>&1; then + echo " Grafana is healthy" +else + echo " Grafana health check failed (may still be starting...)" +fi + +# Check Device Metrics Exporter +if curl -s http://localhost:5000/metrics | head -1 > /dev/null 2>&1; then + echo " Device Metrics Exporter is responding" +else + echo " Device Metrics Exporter check failed" +fi + +# Check targets if jq available +if command -v jq &> /dev/null; then + echo "" + echo "Prometheus Targets:" + curl -s http://localhost:9090/api/v1/targets 2>/dev/null | \ + jq -r '.data.activeTargets[]? | "\(.labels.instance): \(.health)"' 2>/dev/null || \ + echo " (Could not retrieve targets)" +fi + +echo "" +echo "============================================" +echo "Deployment Complete!" +echo "============================================" +echo "" +echo "Access URLs:" +echo " Prometheus: http://localhost:9090" +echo " Grafana: http://localhost:3000" +echo " Exporter: http://localhost:5000/metrics" +echo "" +echo "Next Steps:" +echo " 1. Log into Grafana (admin/admin)" +echo " 2. Add Prometheus as datasource: http://localhost:9090" +echo " 3. Import dashboards from monitoring/grafana/dashboards/ (if available)" +echo " 4. Run CVS tests with --prometheus-url=http://localhost:9090" +echo ""
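For a quick sanity check once the stack is up, the commands below exercise each component directly. This is a minimal sketch that assumes the default ports from monitoring_config.json (exporter on 5000, Prometheus on 9090, Grafana on 3000), a localhost deployment, and that curl and jq are available; substitute the management node address on multi-node clusters. gpu_edge_temperature is one of the metrics the provisioned dashboard already graphs.

# Exporter: confirm GPU metrics are being published
curl -s http://localhost:5000/metrics | grep -m 5 '^gpu_'

# Prometheus: instant query for GPU edge temperature across all scraped nodes
curl -s 'http://localhost:9090/api/v1/query?query=gpu_edge_temperature' | \
  jq '.data.result[] | {node: .metric.node, gpu: .metric.gpu_id, value: .value[1]}'

# Prometheus: every device-metrics-exporter target should report "up"
curl -s http://localhost:9090/api/v1/targets | \
  jq -r '.data.activeTargets[] | "\(.labels.instance): \(.health)"'

# Grafana: basic health endpoint (default credentials admin/admin until changed)
curl -s http://localhost:3000/api/health | jq .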