diff --git a/input/cluster_file/sample_monitor_cluster.json b/input/cluster_file/sample_monitor_cluster.json new file mode 100644 index 00000000..27869619 --- /dev/null +++ b/input/cluster_file/sample_monitor_cluster.json @@ -0,0 +1,13 @@ +{ + "username": "svdt-8", + "priv_key_file": "/home/svdt-8/.ssh/id_rsa", + "head_node_dict": { + "mgmt_ip": "localhost" + }, + "node_dict": { + "localhost": { + "bmc_ip": "NA", + "vpc_ip": "localhost" + } + } +} diff --git a/input/config_file/monitoring/monitoring_config.json b/input/config_file/monitoring/monitoring_config.json new file mode 100644 index 00000000..4e7ee738 --- /dev/null +++ b/input/config_file/monitoring/monitoring_config.json @@ -0,0 +1,38 @@ +{ + "monitoring": { + "device_metrics_exporter_version": "{device-metrics-version}", + "device_metrics_exporter_image": "rocm/device-metrics-exporter:{device-metrics-version}", + "device_metrics_exporter_port": 5000, + + "prometheus_host": "{prometheus-host}", + "prometheus_port": 9090, + "prometheus_version": "{prometheus-version}", + "prometheus_url": "http://{prometheus-host}:{prometheus-port}", + + "grafana_host": "{grafana-host}", + "grafana_port": 3000, + "grafana_version": "{grafana-version}", + "grafana_url": "http://{grafana-host}:{grafana-port}", + "grafana_username": "admin", + "grafana_password": "{grafana-password}", + "grafana_api_key": "{grafana-api-key}", + + "scrape_interval": "15s", + "scrape_timeout": "10s", + "retention_days": 30, + + "alert_thresholds": { + "temperature_warning": 95, + "temperature_critical": 105, + "power_warning": 700, + "ecc_error_rate_warning": 10, + "memory_usage_warning": 90 + }, + + "deployment": { + "docker_network": "host", + "restart_policy": "unless-stopped", + "log_level": "INFO" + } + } +} \ No newline at end of file diff --git a/lib/device_metrics_lib.py b/lib/device_metrics_lib.py new file mode 100644 index 00000000..2eef53e8 --- /dev/null +++ b/lib/device_metrics_lib.py @@ -0,0 +1,242 @@ +''' +Copyright 2025 Advanced Micro Devices, Inc. 
+Device Metrics Integration Library for CVS +''' + +import requests +import json +import logging +from typing import Dict, List, Optional, Any +from datetime import datetime + +log = logging.getLogger(__name__) + + +class PrometheusClient: + """Client for querying Prometheus API to retrieve GPU metrics.""" + + def __init__(self, prometheus_url: str, timeout: int = 30): + self.base_url = prometheus_url.rstrip('/') + self.timeout = timeout + self.api_url = f"{self.base_url}/api/v1" + + def check_health(self) -> bool: + """Check if Prometheus server is healthy.""" + try: + response = requests.get(f"{self.base_url}/-/healthy", timeout=5) + return response.status_code == 200 + except Exception as e: + log.error(f"Prometheus health check failed: {e}") + return False + + def query_instant(self, query: str, time: Optional[str] = None) -> Optional[Dict]: + """Execute instant Prometheus query.""" + params = {'query': query} + if time: + params['time'] = time + + try: + response = requests.get( + f"{self.api_url}/query", + params=params, + timeout=self.timeout + ) + response.raise_for_status() + data = response.json() + + if data.get('status') == 'success': + return data.get('data') + else: + log.error(f"Prometheus query failed: {data.get('error')}") + return None + + except Exception as e: + log.error(f"Error querying Prometheus: {e}") + return None + + def query_range(self, query: str, start: str, end: str, step: str = '15s') -> Optional[Dict]: + """Execute range Prometheus query for time-series data.""" + params = { + 'query': query, + 'start': start, + 'end': end, + 'step': step + } + + try: + response = requests.get( + f"{self.api_url}/query_range", + params=params, + timeout=self.timeout + ) + response.raise_for_status() + data = response.json() + + if data.get('status') == 'success': + return data.get('data') + else: + log.error(f"Prometheus range query failed: {data.get('error')}") + return None + + except Exception as e: + log.error(f"Error querying Prometheus range: {e}") + return None + + +def get_gpu_metrics_from_prometheus( + prom_client: PrometheusClient, + node: str, + gpu_ids: Optional[List[str]] = None +) -> Dict[str, Dict[str, float]]: + """ + Retrieve GPU metrics from Prometheus for a specific node. 
+ + Returns: + { + '0': {'temperature': 45.0, 'power': 300.5, 'utilization': 85.0}, + '1': {'temperature': 46.0, 'power': 295.3, 'utilization': 82.0} + } + """ + metrics_dict = {} + + # Query temperature + temp_query = f'amdgpu_temperature_celsius{{node="{node}", sensor="edge"}}' + temp_data = prom_client.query_instant(temp_query) + + if temp_data and temp_data.get('result'): + for result in temp_data['result']: + gpu_id = result['metric'].get('gpu', 'unknown') + if gpu_ids is None or gpu_id in gpu_ids: + if gpu_id not in metrics_dict: + metrics_dict[gpu_id] = {} + metrics_dict[gpu_id]['temperature'] = float(result['value'][1]) + + # Query power consumption + power_query = f'amdgpu_power_watts{{node="{node}"}}' + power_data = prom_client.query_instant(power_query) + + if power_data and power_data.get('result'): + for result in power_data['result']: + gpu_id = result['metric'].get('gpu', 'unknown') + if gpu_ids is None or gpu_id in gpu_ids: + if gpu_id not in metrics_dict: + metrics_dict[gpu_id] = {} + metrics_dict[gpu_id]['power'] = float(result['value'][1]) + + # Query GPU utilization + util_query = f'amdgpu_gpu_busy_percent{{node="{node}"}}' + util_data = prom_client.query_instant(util_query) + + if util_data and util_data.get('result'): + for result in util_data['result']: + gpu_id = result['metric'].get('gpu', 'unknown') + if gpu_ids is None or gpu_id in gpu_ids: + if gpu_id not in metrics_dict: + metrics_dict[gpu_id] = {} + metrics_dict[gpu_id]['utilization'] = float(result['value'][1]) + + return metrics_dict + + +def get_device_exporter_health( + prom_client: PrometheusClient, + nodes: List[str] +) -> Dict[str, bool]: + """Check health status of Device Metrics Exporter on all nodes.""" + health_dict = {} + + for node in nodes: + query = f'up{{job="device-metrics-exporter", node="{node}"}}' + data = prom_client.query_instant(query) + + if data and data.get('result'): + is_up = float(data['result'][0]['value'][1]) == 1.0 + health_dict[node] = is_up + else: + health_dict[node] = False + + return health_dict + + +def create_grafana_annotation( + grafana_url: str, + text: str, + tags: List[str] = None, + api_key: Optional[str] = None, + username: Optional[str] = None, + password: Optional[str] = None, + time: Optional[int] = None +) -> bool: + """Create annotation in Grafana to mark test events on dashboards.""" + if tags is None: + tags = ['cvs-test'] + + if time is None: + time = int(datetime.now().timestamp() * 1000) + + url = f"{grafana_url.rstrip('/')}/api/annotations" + + payload = { + 'text': text, + 'tags': tags, + 'time': time + } + + headers = {'Content-Type': 'application/json'} + + if not api_key and (not username or not password): + log.warning("Grafana annotation requested without credentials or API key; skipping.") + return False + + if api_key: + headers['Authorization'] = f'Bearer {api_key}' + auth = None + else: + auth = (username, password) + + try: + response = requests.post( + url, + json=payload, + headers=headers, + auth=auth, + timeout=10 + ) + response.raise_for_status() + log.info(f"Created Grafana annotation: {text}") + return True + + except Exception as e: + log.error(f"Failed to create Grafana annotation: {e}") + return False + + +# Test function +if __name__ == '__main__': + import sys + + if len(sys.argv) < 3: + print("Usage: python device_metrics_lib.py ") + print("Example: python device_metrics_lib.py http://localhost:9090 localhost") + sys.exit(1) + + prometheus_url = sys.argv[1] + node = sys.argv[2] + + print(f"Testing Prometheus integration with 
{prometheus_url}") + + client = PrometheusClient(prometheus_url) + + if not client.check_health(): + print("ERROR: Prometheus server is not healthy") + sys.exit(1) + print("Prometheus server is healthy") + + metrics = get_gpu_metrics_from_prometheus(client, node) + if metrics: + print(f"Retrieved metrics for {len(metrics)} GPUs") + for gpu_id, data in metrics.items(): + print(f" GPU {gpu_id}: Temp={data.get('temperature', 'N/A')}°C, " + f"Power={data.get('power', 'N/A')}W") + else: + print("WARNING: No GPU metrics found") diff --git a/lib/gpu_metrics_lib.py b/lib/gpu_metrics_lib.py new file mode 100644 index 00000000..c9d6b556 --- /dev/null +++ b/lib/gpu_metrics_lib.py @@ -0,0 +1,416 @@ +''' +Copyright 2025 Advanced Micro Devices, Inc. +All rights reserved. +''' + +""" +Device Metrics Integration Library for CVS + +This module provides integration between CVS and AMD ROCm Device Metrics Exporter +via Prometheus. It enables CVS to query GPU metrics from Prometheus instead of +(or in addition to) SSH-based amd-smi/rocm-smi commands. + +Device Metrics Exporter: https://github.com/ROCm/device-metrics-exporter +""" + +import requests +import json +import time +from datetime import datetime, timedelta +from typing import Dict, List, Optional, Any, Tuple +import logging + +log = logging.getLogger(__name__) + + +class PrometheusClient: + """ + Client for querying Prometheus server that scrapes Device Metrics Exporter. + """ + + def __init__(self, prometheus_url: str , timeout: int = 30): + if not prometheus_url: + # fall back only if truly absent + prometheus_url = os.getenv("PROMETHEUS_URL", "http://localhost:9090") + self.prometheus_url = prometheus_url.rstrip('/') + self.timeout = timeout + self.api_url = f"{self.prometheus_url}/api/v1" + log.info(f"Initialized Prometheus client for {self.prometheus_url}") + + def check_health(self) -> bool: + """Check if Prometheus server is healthy and reachable.""" + try: + response = requests.get( + f"{self.prometheus_url}/-/healthy", + timeout=self.timeout + ) + if response.status_code == 200: + log.info(f"✓ Prometheus server at {self.prometheus_url} is healthy") + return True + else: + log.error(f"✗ Prometheus health check failed with status {response.status_code}") + return False + except Exception as e: + log.error(f"✗ Failed to connect to Prometheus at {self.prometheus_url}: {e}") + return False + + def query_instant(self, query: str) -> Dict[str, Any]: + """Execute an instant PromQL query.""" + try: + response = requests.get( + f"{self.api_url}/query", + params={'query': query}, + timeout=self.timeout + ) + response.raise_for_status() + result = response.json() + + if result.get('status') == 'success': + log.debug(f"Query successful: {query[:50]}...") + else: + log.warning(f"Query returned non-success status: {result.get('error', 'Unknown error')}") + + return result + except Exception as e: + log.error(f"Prometheus instant query failed: {e}") + return {"status": "error", "error": str(e)} + + def query_range(self, query: str, start_time: datetime, end_time: datetime, + step: str = "15s") -> Dict[str, Any]: + """Execute a range PromQL query for time-series data.""" + try: + response = requests.get( + f"{self.api_url}/query_range", + params={ + 'query': query, + 'start': start_time.timestamp(), + 'end': end_time.timestamp(), + 'step': step + }, + timeout=self.timeout + ) + response.raise_for_status() + result = response.json() + + if result.get('status') == 'success': + log.debug(f"Range query successful: {query[:50]}... 
[{start_time} to {end_time}]") + + return result + except Exception as e: + log.error(f"Prometheus range query failed: {e}") + return {"status": "error", "error": str(e)} + + def get_targets(self) -> List[Dict[str, Any]]: + """Get list of all scrape targets (Device Metrics Exporters) and their status.""" + try: + response = requests.get( + f"{self.api_url}/targets", + timeout=self.timeout + ) + response.raise_for_status() + data = response.json() + + if data.get('status') == 'success': + targets = data.get('data', {}).get('activeTargets', []) + log.info(f"Retrieved {len(targets)} active targets from Prometheus") + return targets + return [] + except Exception as e: + log.error(f"Failed to get Prometheus targets: {e}") + return [] + + +# Device Metrics Exporter metric names (as of v1.4.0) +DEVICE_METRICS_MAP = { + # Temperature metrics + 'temperature_edge': 'amdgpu_temperature_edge_celsius', + 'temperature_junction': 'amdgpu_temperature_junction_celsius', + 'temperature_memory': 'amdgpu_temperature_memory_celsius', + 'temperature_hbm': 'amdgpu_temperature_hbm_celsius', + + # Utilization metrics + 'gpu_utilization': 'amdgpu_gpu_utilization_percent', + 'memory_utilization': 'amdgpu_memory_utilization_percent', + + # Power metrics + 'power_current': 'amdgpu_power_watts', + 'power_average': 'amdgpu_power_average_watts', + 'energy_consumed': 'amdgpu_energy_joules', + + # Memory metrics + 'memory_used': 'amdgpu_memory_used_bytes', + 'memory_total': 'amdgpu_memory_total_bytes', + 'memory_free': 'amdgpu_memory_free_bytes', + + # Clock metrics + 'clock_gpu': 'amdgpu_gpu_clock_mhz', + 'clock_memory': 'amdgpu_memory_clock_mhz', + + # PCIe metrics + 'pcie_bandwidth': 'amdgpu_pcie_bandwidth_bytes', + 'pcie_link_speed': 'amdgpu_pcie_link_speed_mbps', + 'pcie_link_width': 'amdgpu_pcie_link_width', + 'pcie_replay_count': 'amdgpu_pcie_replay_count_total', + 'pcie_nak_sent': 'amdgpu_pcie_nak_sent_total', + 'pcie_nak_received': 'amdgpu_pcie_nak_received_total', + + # Error metrics + 'ecc_correctable': 'amdgpu_ecc_correctable_errors_total', + 'ecc_uncorrectable': 'amdgpu_ecc_uncorrectable_errors_total', + 'ras_correctable': 'amdgpu_ras_correctable_error_count', + 'ras_uncorrectable': 'amdgpu_ras_uncorrectable_error_count', +} + + +def get_gpu_metrics_from_prometheus(prom_client: PrometheusClient, + node_list: Optional[List[str]] = None, + metrics: Optional[List[str]] = None) -> Dict[str, Dict]: + """ + Query current GPU metrics from Prometheus for all or specific nodes. 
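+
+    Example (illustrative; the host and node name are placeholders):
+        prom = PrometheusClient('http://localhost:9090')
+        data = get_gpu_metrics_from_prometheus(prom, node_list=['node-1'],
+                                               metrics=['temperature_edge', 'power_current'])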
+ + Returns: + Dict with structure: {node: {gpu_id: {metric_name: value}}} + """ + metrics_dict = {} + + if metrics is None: + metrics = [ + 'temperature_edge', 'temperature_junction', 'temperature_memory', + 'power_current', 'power_average', + 'gpu_utilization', 'memory_utilization', + 'memory_used', 'memory_total', + 'pcie_bandwidth', 'pcie_link_speed', + 'ecc_correctable', 'ecc_uncorrectable', + 'clock_gpu', 'clock_memory' + ] + + for metric_key in metrics: + if metric_key not in DEVICE_METRICS_MAP: + log.warning(f"Unknown metric key: {metric_key}, skipping") + continue + + metric_name = DEVICE_METRICS_MAP[metric_key] + + # Build query with optional node filter + if node_list: + node_filter = '|'.join([node.replace('.', '\\.') for node in node_list]) + query = f'{metric_name}{{instance=~"({node_filter}):.*"}}' + else: + query = metric_name + + result = prom_client.query_instant(query) + + if result.get('status') == 'success': + for item in result.get('data', {}).get('result', []): + labels = item.get('metric', {}) + instance = labels.get('instance', '') + node = instance.split(':')[0] if ':' in instance else instance + gpu_id = labels.get('gpu', labels.get('gpu_id', 'unknown')) + value = item.get('value', [None, None])[1] + + try: + if value is not None: + value = float(value) + except (ValueError, TypeError): + pass + + if node not in metrics_dict: + metrics_dict[node] = {} + if gpu_id not in metrics_dict[node]: + metrics_dict[node][gpu_id] = {} + + metrics_dict[node][gpu_id][metric_key] = value + else: + log.warning(f"Failed to query metric {metric_key}: {result.get('error', 'Unknown error')}") + + log.info(f"Retrieved metrics for {len(metrics_dict)} nodes, {len(metrics)} metric types") + return metrics_dict + + +def get_device_exporter_health(prom_client: PrometheusClient, + node_list: Optional[List[str]] = None) -> Dict[str, Dict]: + """ + Check health status of Device Metrics Exporter on all nodes. 
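+
+    Returns a dict keyed by node; the shape below is illustrative:
+        {'node-1': {'health': 'up', 'last_scrape': '...', 'last_error': ''}}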
+ """ + health_dict = {} + targets = prom_client.get_targets() + + for target in targets: + labels = target.get('labels', {}) + instance = labels.get('instance', '') + job = labels.get('job', '') + + if 'device-metrics' not in job.lower() and 'amd' not in job.lower(): + continue + + node = instance.split(':')[0] if ':' in instance else instance + + if node_list and node not in node_list: + continue + + health_dict[node] = { + 'health': target.get('health', 'unknown'), + 'last_scrape': target.get('lastScrape', ''), + 'scrape_duration': target.get('lastScrapeDuration', 0), + 'last_error': target.get('lastError', ''), + 'scrape_url': target.get('scrapeUrl', ''), + 'labels': labels + } + + up_count = sum(1 for h in health_dict.values() if h['health'] == 'up') + down_count = sum(1 for h in health_dict.values() if h['health'] == 'down') + log.info(f"Exporter health: {up_count} up, {down_count} down out of {len(health_dict)} nodes") + + return health_dict + + +def create_grafana_annotation(grafana_url: str, api_key: str, + text: str, tags: List[str], + start_time: Optional[datetime] = None, + end_time: Optional[datetime] = None) -> bool: + """Create an annotation in Grafana to mark CVS test events.""" + try: + url = f"{grafana_url.rstrip('/')}/api/annotations" + headers = { + 'Authorization': f'Bearer {api_key}', + 'Content-Type': 'application/json' + } + + if start_time is None: + start_time = datetime.now() + + data = { + 'text': text, + 'tags': tags, + 'time': int(start_time.timestamp() * 1000) + } + + if end_time: + data['timeEnd'] = int(end_time.timestamp() * 1000) + + response = requests.post(url, headers=headers, json=data, timeout=10) + response.raise_for_status() + + log.info(f"✓ Created Grafana annotation: {text}") + return True + + except Exception as e: + log.error(f"✗ Failed to create Grafana annotation: {e}") + return False + + +def compare_ssh_vs_prometheus(ssh_metrics: Dict, prom_metrics: Dict, + tolerance: float = 5.0) -> Dict: + """Compare metrics collected via SSH vs Prometheus to validate consistency.""" + comparison = { + 'summary': { + 'total_nodes': 0, + 'matching_nodes': 0, + 'discrepancy_nodes': 0, + 'ssh_only_nodes': 0, + 'prom_only_nodes': 0 + }, + 'node_comparisons': [], + 'discrepancies': [] + } + + ssh_nodes = set(ssh_metrics.keys()) + prom_nodes = set(prom_metrics.keys()) + + comparison['summary']['total_nodes'] = len(ssh_nodes | prom_nodes) + comparison['summary']['ssh_only_nodes'] = len(ssh_nodes - prom_nodes) + comparison['summary']['prom_only_nodes'] = len(prom_nodes - ssh_nodes) + + for node in (ssh_nodes - prom_nodes): + log.warning(f"Node {node} only in SSH metrics (not in Prometheus)") + comparison['node_comparisons'].append({ + 'node': node, + 'status': 'ssh_only', + 'gpu_count_match': False + }) + + for node in (prom_nodes - ssh_nodes): + log.warning(f"Node {node} only in Prometheus metrics (not in SSH)") + comparison['node_comparisons'].append({ + 'node': node, + 'status': 'prom_only', + 'gpu_count_match': False + }) + + common_nodes = ssh_nodes & prom_nodes + + for node in common_nodes: + node_comparison = { + 'node': node, + 'status': 'match', + 'gpu_count_match': True, + 'metric_comparisons': [] + } + + ssh_gpus = set(ssh_metrics[node].keys()) + prom_gpus = set(prom_metrics[node].keys()) + + if ssh_gpus != prom_gpus: + node_comparison['gpu_count_match'] = False + node_comparison['status'] = 'discrepancy' + log.warning(f"Node {node}: GPU count mismatch") + + common_gpus = ssh_gpus & prom_gpus + for gpu_id in common_gpus: + ssh_gpu = 
ssh_metrics[node][gpu_id] + prom_gpu = prom_metrics[node][gpu_id] + + ssh_metric_keys = set(ssh_gpu.keys()) + prom_metric_keys = set(prom_gpu.keys()) + common_metrics = ssh_metric_keys & prom_metric_keys + + for metric_key in common_metrics: + ssh_val = ssh_gpu[metric_key] + prom_val = prom_gpu[metric_key] + + if ssh_val is None or prom_val is None: + continue + + try: + ssh_num = float(ssh_val) + prom_num = float(prom_val) + + if ssh_num != 0: + diff_percent = abs((prom_num - ssh_num) / ssh_num) * 100 + else: + diff_percent = 0 if prom_num == 0 else 100 + + if diff_percent > tolerance: + node_comparison['status'] = 'discrepancy' + comparison['discrepancies'].append({ + 'node': node, + 'gpu': str(gpu_id), + 'metric': metric_key, + 'ssh_value': ssh_num, + 'prom_value': prom_num, + 'diff_percent': round(diff_percent, 2) + }) + except (ValueError, TypeError): + if str(ssh_val) != str(prom_val): + node_comparison['status'] = 'discrepancy' + comparison['discrepancies'].append({ + 'node': node, + 'gpu': str(gpu_id), + 'metric': metric_key, + 'ssh_value': str(ssh_val), + 'prom_value': str(prom_val), + 'diff_percent': None + }) + + comparison['node_comparisons'].append(node_comparison) + + if node_comparison['status'] == 'match': + comparison['summary']['matching_nodes'] += 1 + else: + comparison['summary']['discrepancy_nodes'] += 1 + + log.info(f"Comparison complete: {comparison['summary']['matching_nodes']}/{len(common_nodes)} nodes match") + if comparison['discrepancies']: + log.warning(f"Found {len(comparison['discrepancies'])} metric discrepancies") + + return comparison diff --git a/lib/grafana_config_lib.py b/lib/grafana_config_lib.py new file mode 100644 index 00000000..5849da2a --- /dev/null +++ b/lib/grafana_config_lib.py @@ -0,0 +1,250 @@ +""" +Grafana configuration and provisioning library +""" +import os +import json +import logging + +log = logging.getLogger(__name__) + + +def setup_grafana_provisioning(monitoring_dir="/tmp/grafana_provisioning"): + """ + Setup Grafana provisioning configs for datasources and dashboards + """ + os.makedirs(f"{monitoring_dir}/datasources", exist_ok=True) + os.makedirs(f"{monitoring_dir}/dashboards", exist_ok=True) + os.makedirs(f"{monitoring_dir}/dashboard_files", exist_ok=True) + + # Datasource config + datasource_config = """apiVersion: 1 + +datasources: + - name: Prometheus + type: prometheus + access: proxy + url: http://localhost:9090 + isDefault: true + editable: false + jsonData: + timeInterval: "5s" +""" + + with open(f"{monitoring_dir}/datasources/prometheus.yml", 'w') as f: + f.write(datasource_config) + + # Dashboard provisioning config + dashboard_config = """apiVersion: 1 + +providers: + - name: 'Default' + orgId: 1 + folder: '' + type: file + disableDeletion: false + updateIntervalSeconds: 10 + allowUiUpdates: true + options: + path: /var/lib/grafana/dashboards +""" + + with open(f"{monitoring_dir}/dashboards/default.yml", 'w') as f: + f.write(dashboard_config) + + log.info(f"Grafana provisioning configs created in {monitoring_dir}") + return monitoring_dir + + +def create_gpu_dashboard(output_file="/tmp/grafana_provisioning/dashboard_files/gpu-metrics.json"): + """ + Create GPU metrics dashboard JSON + """ + dashboard = { + "annotations": {"list": []}, + "editable": True, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": None, + "links": [], + "liveNow": False, + "panels": [ + { + "datasource": {"type": "prometheus", "uid": "prometheus"}, + "fieldConfig": { + "defaults": { + "color": {"mode": "palette-classic"}, + "custom": { + 
"axisCenteredZero": False, + "axisColorMode": "text", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "showPoints": "never" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + {"color": "green", "value": None}, + {"color": "yellow", "value": 70}, + {"color": "red", "value": 85} + ] + }, + "unit": "celsius" + } + }, + "gridPos": {"h": 8, "w": 12, "x": 0, "y": 0}, + "id": 1, + "options": { + "legend": {"displayMode": "list", "placement": "bottom", "showLegend": True}, + "tooltip": {"mode": "multi"} + }, + "targets": [ + { + "datasource": {"type": "prometheus", "uid": "prometheus"}, + "expr": "gpu_temp_degrees", + "legendFormat": "{{instance}} - GPU {{gpu_index}}", + "refId": "A" + } + ], + "title": "GPU Temperature", + "type": "timeseries" + }, + { + "datasource": {"type": "prometheus", "uid": "prometheus"}, + "fieldConfig": { + "defaults": { + "color": {"mode": "palette-classic"}, + "custom": { + "axisCenteredZero": False, + "axisColorMode": "text", + "axisPlacement": "auto", + "drawStyle": "line", + "fillOpacity": 10, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5 + }, + "mappings": [], + "max": 100, + "min": 0, + "unit": "percent" + } + }, + "gridPos": {"h": 8, "w": 12, "x": 12, "y": 0}, + "id": 2, + "options": { + "legend": {"displayMode": "list", "placement": "bottom", "showLegend": True}, + "tooltip": {"mode": "multi"} + }, + "targets": [ + { + "datasource": {"type": "prometheus", "uid": "prometheus"}, + "expr": "gpu_utilization_percent", + "legendFormat": "{{instance}} - GPU {{gpu_index}}", + "refId": "A" + } + ], + "title": "GPU Utilization %", + "type": "timeseries" + }, + { + "datasource": {"type": "prometheus", "uid": "prometheus"}, + "fieldConfig": { + "defaults": { + "color": {"mode": "palette-classic"}, + "custom": { + "axisCenteredZero": False, + "axisColorMode": "text", + "axisPlacement": "auto", + "drawStyle": "line", + "fillOpacity": 10, + "lineInterpolation": "linear", + "lineWidth": 1 + }, + "mappings": [], + "unit": "bytes" + } + }, + "gridPos": {"h": 8, "w": 12, "x": 0, "y": 8}, + "id": 3, + "options": { + "legend": {"displayMode": "list", "placement": "bottom", "showLegend": True}, + "tooltip": {"mode": "multi"} + }, + "targets": [ + { + "datasource": {"type": "prometheus", "uid": "prometheus"}, + "expr": "gpu_memory_used_bytes", + "legendFormat": "{{instance}} - GPU {{gpu_index}} Used", + "refId": "A" + }, + { + "datasource": {"type": "prometheus", "uid": "prometheus"}, + "expr": "gpu_memory_total_bytes", + "legendFormat": "{{instance}} - GPU {{gpu_index}} Total", + "refId": "B" + } + ], + "title": "GPU Memory Usage", + "type": "timeseries" + }, + { + "datasource": {"type": "prometheus", "uid": "prometheus"}, + "fieldConfig": { + "defaults": { + "color": {"mode": "palette-classic"}, + "custom": { + "axisCenteredZero": False, + "axisColorMode": "text", + "axisPlacement": "auto", + "drawStyle": "line", + "fillOpacity": 10, + "lineInterpolation": "linear", + "lineWidth": 1 + }, + "mappings": [], + "unit": "watt" + } + }, + "gridPos": {"h": 8, "w": 12, "x": 12, "y": 8}, + "id": 4, + "options": { + "legend": {"displayMode": "list", "placement": "bottom", "showLegend": True}, + "tooltip": {"mode": "multi"} + }, + "targets": [ + { + "datasource": {"type": "prometheus", "uid": "prometheus"}, + "expr": "gpu_power_watts", + "legendFormat": "{{instance}} - GPU {{gpu_index}}", + "refId": "A" + } 
+ ], + "title": "GPU Power Consumption", + "type": "timeseries" + } + ], + "refresh": "5s", + "schemaVersion": 38, + "style": "dark", + "tags": ["gpu", "amd", "rocm"], + "templating": {"list": []}, + "time": {"from": "now-15m", "to": "now"}, + "timepicker": {}, + "timezone": "", + "title": "AMD GPU Metrics Dashboard", + "uid": "amd-gpu-metrics", + "version": 1 + } + + with open(output_file, 'w') as f: + json.dump(dashboard, f, indent=2) + + log.info(f"GPU dashboard created: {output_file}") + return output_file diff --git a/lib/prometheus_config_lib.py b/lib/prometheus_config_lib.py new file mode 100644 index 00000000..da6934f6 --- /dev/null +++ b/lib/prometheus_config_lib.py @@ -0,0 +1,107 @@ +''' +Copyright 2025 Advanced Micro Devices, Inc. +Prometheus Configuration Generator for CVS Monitoring +''' + +import json +import yaml +import logging + +log = logging.getLogger(__name__) + + +def generate_prometheus_config(cluster_dict, config_dict, output_file=None): + """ + Generate Prometheus configuration with dynamic scrape targets. + + Args: + cluster_dict: Cluster configuration + config_dict: Monitoring configuration + output_file: Optional output file path + + Returns: + str: YAML configuration content + """ + from utils_lib import generate_prometheus_targets + + # Get configuration values + scrape_interval = config_dict.get('scrape_interval', '15s') + scrape_timeout = config_dict.get('scrape_timeout', '10s') + retention_days = config_dict.get('retention_days', 30) + exporter_port = config_dict.get('device_metrics_exporter_port', 5000) + + # Generate targets for all nodes (management + workers) + targets = generate_prometheus_targets(cluster_dict, exporter_port) + + log.info(f"Generating Prometheus config for {len(targets)} targets") + for target in targets: + log.info(f" • {target}") + + # Build Prometheus configuration + config = { + 'global': { + 'scrape_interval': scrape_interval, + 'scrape_timeout': scrape_timeout, + 'evaluation_interval': scrape_interval + }, + 'scrape_configs': [ + { + 'job_name': 'device-metrics-exporter', + 'static_configs': [ + { + 'targets': targets + } + ], + 'metric_relabel_configs': [ + { + 'source_labels': ['__name__'], + 'regex': 'gpu_.*', + 'action': 'keep' + } + ] + } + ] + } + + # Convert to YAML + yaml_content = yaml.dump(config, default_flow_style=False, sort_keys=False) + + # Write to file if specified + if output_file: + with open(output_file, 'w') as f: + f.write(yaml_content) + log.info(f"Prometheus config written to: {output_file}") + + return yaml_content + + +def update_prometheus_targets(prometheus_yml_path, cluster_dict, exporter_port=5000): + """ + Update existing Prometheus config with new targets. 
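+
+    Example (the config path is a placeholder; assumes cluster_dict is already loaded):
+        update_prometheus_targets('/etc/prometheus/prometheus.yml', cluster_dict, exporter_port=5000)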
+ + Args: + prometheus_yml_path: Path to prometheus.yml + cluster_dict: Cluster configuration + exporter_port: Exporter port (default: 5000) + """ + from utils_lib import generate_prometheus_targets + + # Load existing config + with open(prometheus_yml_path, 'r') as f: + config = yaml.safe_load(f) + + # Generate new targets + targets = generate_prometheus_targets(cluster_dict, exporter_port) + + # Update targets in scrape config + for scrape_config in config.get('scrape_configs', []): + if scrape_config.get('job_name') == 'device-metrics-exporter': + scrape_config['static_configs'] = [{'targets': targets}] + log.info(f"Updated scrape targets: {targets}") + break + + # Write back + with open(prometheus_yml_path, 'w') as f: + yaml.dump(config, f, default_flow_style=False, sort_keys=False) + + log.info(f"Prometheus config updated: {prometheus_yml_path}") diff --git a/lib/utils_lib.py b/lib/utils_lib.py index bc8f0387..9c288931 100644 --- a/lib/utils_lib.py +++ b/lib/utils_lib.py @@ -420,6 +420,80 @@ def resolve_test_config_placeholders(config_dict, cluster_dict): return resolved_config +def resolve_placeholder_with_fallback(value, fallback): + """ + Resolve placeholder strings, returning fallback if unresolved. + + Args: + value: Value that may contain unresolved placeholders like {prometheus-host} + fallback: Default value to use if placeholder is unresolved + + Returns: + Resolved value or fallback if value is None/empty/unresolved placeholder + + Examples: + >>> resolve_placeholder_with_fallback("{prometheus-host}", "localhost") + 'localhost' + >>> resolve_placeholder_with_fallback("10.0.0.5", "localhost") + '10.0.0.5' + >>> resolve_placeholder_with_fallback(None, "localhost") + 'localhost' + """ + if value is None: + return fallback + + # Convert to string + value_str = str(value).strip() + + # Empty or unresolved placeholder (starts with { and ends with }) + if not value_str or (value_str.startswith("{") and value_str.endswith("}")): + return fallback + + return value_str + + +def apply_monitoring_defaults(config_dict): + """ + Apply default fallback values for monitoring configuration. + Ensures localhost/default ports/versions when placeholders aren't resolved. + + Args: + config_dict: Monitoring configuration dictionary + + Returns: + dict: Configuration with defaults applied + """ + defaults = { + 'prometheus_host': 'localhost', + 'prometheus_port': 9090, + 'prometheus_version': 'v2.55.0', + 'grafana_host': 'localhost', + 'grafana_port': 3000, + 'grafana_version': '10.4.1', + 'device_metrics_exporter_version': 'v1.4.0', + 'device_metrics_exporter_port': 5000, + 'device_metrics_exporter_host': 'localhost', + } + + result = config_dict.copy() + + for key, default_value in defaults.items(): + current_value = result.get(key) + result[key] = resolve_placeholder_with_fallback(current_value, default_value) + + # Build derived URLs with resolved values + if 'prometheus_url' in result: + prom_host = result['prometheus_host'] + prom_port = result['prometheus_port'] + result['prometheus_url'] = f"http://{prom_host}:{prom_port}" + + if 'grafana_url' in result: + graf_host = result['grafana_host'] + graf_port = result['grafana_port'] + result['grafana_url'] = f"http://{graf_host}:{graf_port}" + + return result + def collect_system_metadata(phdl, cluster_dict, config_dict, test_command=None, env_vars=None): """ Collect comprehensive system metadata from compute nodes for test reporting. 
@@ -687,3 +761,138 @@ def collect_system_metadata(phdl, cluster_dict, config_dict, test_command=None, log.info(f'Collected metadata: {list(metadata.keys())}') return metadata + + +def get_management_node(cluster_dict): + """ + Get the management/head node from cluster configuration. + + Args: + cluster_dict: Cluster configuration dictionary + + Returns: + str: Management node IP/hostname + + Example: + >>> cluster = {'head_node_dict': {'mgmt_ip': '10.0.0.100'}} + >>> get_management_node(cluster) + '10.0.0.100' + """ + return cluster_dict.get('head_node_dict', {}).get('mgmt_ip', 'localhost') + + +def get_all_nodes(cluster_dict): + """ + Get all nodes (workers + management) from cluster configuration. + + Args: + cluster_dict: Cluster configuration dictionary + + Returns: + list: All node IPs/hostnames including management node + + Example: + >>> cluster = { + ... 'head_node_dict': {'mgmt_ip': '10.0.0.100'}, + ... 'node_dict': {'10.0.0.101': {...}, '10.0.0.102': {...}} + ... } + >>> get_all_nodes(cluster) + ['10.0.0.100', '10.0.0.101', '10.0.0.102'] + """ + mgmt_node = get_management_node(cluster_dict) + worker_nodes = list(cluster_dict.get('node_dict', {}).keys()) + + # Management node + all workers + all_nodes = [mgmt_node] + worker_nodes + + # Remove duplicates (in case mgmt is also in node_dict) + return list(dict.fromkeys(all_nodes)) + + +def get_worker_nodes(cluster_dict): + """ + Get worker nodes only (excluding management node). + + Args: + cluster_dict: Cluster configuration dictionary + + Returns: + list: Worker node IPs/hostnames + """ + return list(cluster_dict.get('node_dict', {}).keys()) + + +def is_management_node(node, cluster_dict): + """ + Check if a node is the management/head node. + + Args: + node: Node IP/hostname to check + cluster_dict: Cluster configuration dictionary + + Returns: + bool: True if node is management node + + Example: + >>> cluster = {'head_node_dict': {'mgmt_ip': 'localhost'}} + >>> is_management_node('localhost', cluster) + True + >>> is_management_node('10.0.0.101', cluster) + False + """ + mgmt_node = get_management_node(cluster_dict) + + # Handle localhost aliases + if mgmt_node in ['localhost', '127.0.0.1'] and node in ['localhost', '127.0.0.1']: + return True + + return node == mgmt_node + + +def is_single_node_deployment(cluster_dict): + """ + Detect if this is a single-node (localhost) deployment. + + Args: + cluster_dict: Cluster configuration dictionary + + Returns: + bool: True if single-node deployment + + Example: + >>> cluster = {'head_node_dict': {'mgmt_ip': 'localhost'}, 'node_dict': {'localhost': {}}} + >>> is_single_node_deployment(cluster) + True + """ + all_nodes = get_all_nodes(cluster_dict) + + # Single node if only one unique node + if len(set(all_nodes)) == 1: + return True + + # Also single node if all nodes are localhost variants + localhost_variants = {'localhost', '127.0.0.1', '::1'} + return all(node in localhost_variants for node in all_nodes) + + +def generate_prometheus_targets(cluster_dict, exporter_port=5000): + """ + Generate Prometheus scrape targets for all nodes. + + Args: + cluster_dict: Cluster configuration dictionary + exporter_port: Port where Device Metrics Exporter runs (default: 5000) + + Returns: + list: Prometheus target strings in format "host:port" + + Example: + >>> cluster = { + ... 'head_node_dict': {'mgmt_ip': '10.0.0.100'}, + ... 'node_dict': {'10.0.0.101': {}, '10.0.0.102': {}} + ... 
} + >>> generate_prometheus_targets(cluster) + ['10.0.0.100:5000', '10.0.0.101:5000', '10.0.0.102:5000'] + """ + all_nodes = get_all_nodes(cluster_dict) + return [f"{node}:{exporter_port}" for node in all_nodes] diff --git a/monitoring/dashboards/gpu-metrics-dashboard.json b/monitoring/dashboards/gpu-metrics-dashboard.json new file mode 100644 index 00000000..ee6048df --- /dev/null +++ b/monitoring/dashboards/gpu-metrics-dashboard.json @@ -0,0 +1,1006 @@ +{ + "annotations": { + "list": [] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "id": null, + "links": [], + "panels": [ + { + "type": "row", + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 100, + "title": "\ud83d\udd34 Critical Health Metrics" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "title": "GPU Edge Temperature", + "type": "stat", + "gridPos": { + "h": 4, + "w": 4, + "x": 0, + "y": 1 + }, + "id": 1, + "fieldConfig": { + "defaults": { + "unit": "celsius", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 70 + }, + { + "color": "orange", + "value": 85 + }, + { + "color": "red", + "value": 95 + } + ] + } + } + }, + "options": { + "colorMode": "background", + "graphMode": "area", + "orientation": "auto" + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "gpu_edge_temperature", + "legendFormat": "{{hostname}} GPU{{gpu_id}}", + "refId": "A" + } + ] + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "title": "GPU Utilization", + "type": "stat", + "gridPos": { + "h": 4, + "w": 4, + "x": 4, + "y": 1 + }, + "id": 2, + "fieldConfig": { + "defaults": { + "unit": "percent", + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "yellow", + "value": 10 + }, + { + "color": "green", + "value": 50 + } + ] + } + } + }, + "options": { + "colorMode": "background", + "graphMode": "area", + "orientation": "auto" + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "gpu_gfx_activity", + "legendFormat": "{{hostname}} GPU{{gpu_id}}", + "refId": "A" + } + ] + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "title": "Power Usage", + "type": "stat", + "gridPos": { + "h": 4, + "w": 4, + "x": 8, + "y": 1 + }, + "id": 3, + "fieldConfig": { + "defaults": { + "unit": "watt", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 500 + }, + { + "color": "red", + "value": 700 + } + ] + } + } + }, + "options": { + "colorMode": "value", + "graphMode": "area" + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "gpu_power_usage", + "legendFormat": "{{hostname}} GPU{{gpu_id}}", + "refId": "A" + } + ] + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "title": "Total GPUs Online", + "type": "stat", + "gridPos": { + "h": 4, + "w": 4, + "x": 12, + "y": 1 + }, + "id": 4, + "fieldConfig": { + "defaults": { + "unit": "short", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + } + } + }, + "options": { + "colorMode": "value", + "graphMode": "none", + "textMode": "value_and_name" + }, + "targets": [ + { + 
"datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "count(gpu_edge_temperature)", + "legendFormat": "Total GPUs", + "refId": "A" + } + ] + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "title": "Memory Usage", + "type": "stat", + "gridPos": { + "h": 4, + "w": 4, + "x": 16, + "y": 1 + }, + "id": 5, + "fieldConfig": { + "defaults": { + "unit": "percent", + "max": 100, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 70 + }, + { + "color": "red", + "value": 90 + } + ] + } + } + }, + "options": { + "colorMode": "background", + "graphMode": "area" + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "(gpu_used_vram / gpu_total_vram) * 100", + "legendFormat": "{{hostname}} GPU{{gpu_id}}", + "refId": "A" + } + ] + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "title": "ECC Errors (5m rate)", + "type": "stat", + "gridPos": { + "h": 4, + "w": 4, + "x": 20, + "y": 1 + }, + "id": 6, + "fieldConfig": { + "defaults": { + "unit": "short", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 10 + }, + { + "color": "red", + "value": 100 + } + ] + } + } + }, + "options": { + "colorMode": "background", + "graphMode": "area" + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "rate(gpu_ecc_correct_total[5m]) * 300", + "legendFormat": "{{hostname}} GPU{{gpu_id}}", + "refId": "A" + } + ] + }, + { + "type": "row", + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 5 + }, + "id": 101, + "title": "\ud83c\udf21\ufe0f Temperature & Utilization" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "title": "GPU Temperatures Over Time", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 6 + }, + "id": 7, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisPlacement": "auto", + "drawStyle": "line", + "fillOpacity": 10, + "lineWidth": 2, + "pointSize": 5, + "showPoints": "never" + }, + "unit": "celsius", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 70 + }, + { + "color": "red", + "value": 85 + } + ] + } + } + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "right", + "showLegend": true, + "calcs": [ + "mean", + "max", + "last" + ] + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "gpu_edge_temperature", + "legendFormat": "{{hostname}} GPU{{gpu_id}} Edge", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "gpu_junction_temperature", + "legendFormat": "{{hostname}} GPU{{gpu_id}} Junction", + "refId": "B" + } + ] + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "title": "GPU Utilization Over Time", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 6 + }, + "id": 8, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisPlacement": "auto", + "drawStyle": "line", + "fillOpacity": 20, + "lineWidth": 2 + }, + "unit": "percent", + "max": 100, + "min": 0 + } + }, + 
"options": { + "legend": { + "displayMode": "table", + "placement": "right", + "showLegend": true, + "calcs": [ + "mean", + "max", + "last" + ] + }, + "tooltip": { + "mode": "multi" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "gpu_gfx_activity", + "legendFormat": "{{hostname}} GPU{{gpu_id}}", + "refId": "A" + } + ] + }, + { + "type": "row", + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 14 + }, + "id": 102, + "title": "\ud83d\udcbe Memory & Power" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "title": "GPU Memory Usage (GB)", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 15 + }, + "id": 9, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisPlacement": "auto", + "drawStyle": "line", + "fillOpacity": 30, + "lineWidth": 2 + }, + "unit": "decbytes" + } + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "calcs": [ + "mean", + "max", + "last" + ] + }, + "tooltip": { + "mode": "multi" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "gpu_used_vram", + "legendFormat": "{{hostname}} GPU{{gpu_id}} Used", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "gpu_total_vram", + "legendFormat": "{{hostname}} GPU{{gpu_id}} Total", + "refId": "B" + } + ] + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "title": "Power Consumption by GPU", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 15 + }, + "id": 10, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisPlacement": "auto", + "drawStyle": "line", + "fillOpacity": 20, + "lineWidth": 2 + }, + "unit": "watt" + } + }, + "options": { + "legend": { + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "calcs": [ + "mean", + "max", + "last" + ] + }, + "tooltip": { + "mode": "multi" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "gpu_power_usage", + "legendFormat": "{{hostname}} GPU{{gpu_id}}", + "refId": "A" + } + ] + }, + { + "type": "row", + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 23 + }, + "id": 103, + "title": "\ud83d\udd27 Advanced Metrics" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "title": "GPU Clock Speed (MHz)", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 24 + }, + "id": 11, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisPlacement": "auto", + "drawStyle": "line", + "lineWidth": 2 + }, + "unit": "hertz" + } + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "gpu_clock{clock_type=\"GPU_CLOCK_TYPE_SYSTEM\"}", + "legendFormat": "{{hostname}} GPU{{gpu_id}}", + "refId": "A" + } + ] + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "title": "Memory Activity (%)", + "type": "timeseries", + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 24 + }, + "id": 12, + "fieldConfig": { + "defaults": { + "color": { + "mode": 
"palette-classic" + }, + "custom": { + "axisPlacement": "auto", + "drawStyle": "line", + "fillOpacity": 15, + "lineWidth": 2 + }, + "unit": "percent", + "max": 100, + "min": 0 + } + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "gpu_umc_activity", + "legendFormat": "{{hostname}} GPU{{gpu_id}}", + "refId": "A" + } + ] + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "title": "ECC Errors by Component (5m rate)", + "type": "bargauge", + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 24 + }, + "id": 13, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 5 + }, + { + "color": "red", + "value": 50 + } + ] + }, + "unit": "short" + } + }, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "showUnfilled": true + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "sum(rate(gpu_ecc_correct_umc[5m]) * 300)", + "legendFormat": "UMC", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "sum(rate(gpu_ecc_correct_gfx[5m]) * 300)", + "legendFormat": "GFX", + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "sum(rate(gpu_ecc_correct_sdma[5m]) * 300)", + "legendFormat": "SDMA", + "refId": "C" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "sum(rate(gpu_ecc_correct_mmhub[5m]) * 300)", + "legendFormat": "MMHUB", + "refId": "D" + } + ] + }, + { + "type": "row", + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 32 + }, + "id": 104, + "title": "\ud83d\udda5\ufe0f Node Comparison" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "title": "Average GPU Temp by Node", + "type": "bargauge", + "gridPos": { + "h": 6, + "w": 8, + "x": 0, + "y": 33 + }, + "id": 14, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 70 + }, + { + "color": "red", + "value": 85 + } + ] + }, + "unit": "celsius" + } + }, + "options": { + "displayMode": "lcd", + "orientation": "horizontal" + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "avg by (hostname) (gpu_edge_temperature)", + "legendFormat": "{{hostname}}", + "refId": "A" + } + ] + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "title": "Total Power by Node", + "type": "piechart", + "gridPos": { + "h": 6, + "w": 8, + "x": 8, + "y": 33 + }, + "id": 15, + "options": { + "legend": { + "displayMode": "table", + "placement": "right", + "showLegend": true, + "values": [ + "value", + "percent" + ] + }, + "pieType": "pie", + "tooltip": { + "mode": "single" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "sum by (hostname) (gpu_power_usage)", + "legendFormat": "{{hostname}}", + "refId": "A" + } + ] + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "title": "GPU Count by Node", + "type": "stat", + "gridPos": { + 
"h": 6, + "w": 8, + "x": 16, + "y": 33 + }, + "id": 16, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "unit": "short" + } + }, + "options": { + "colorMode": "value", + "graphMode": "none", + "textMode": "value_and_name", + "orientation": "horizontal" + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "expr": "count by (hostname) (gpu_edge_temperature)", + "legendFormat": "{{hostname}}", + "refId": "A" + } + ] + } + ], + "refresh": "5s", + "schemaVersion": 39, + "tags": [ + "gpu", + "amd", + "rocm", + "cluster" + ], + "templating": { + "list": [] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h" + ] + }, + "timezone": "browser", + "title": "AMD GPU Cluster Monitoring", + "uid": "amd-gpu-metrics", + "version": 1 +} \ No newline at end of file diff --git a/monitoring/prometheus/alert_rules.yml b/monitoring/prometheus/alert_rules.yml new file mode 100644 index 00000000..723424d6 --- /dev/null +++ b/monitoring/prometheus/alert_rules.yml @@ -0,0 +1,74 @@ +# Prometheus Alert Rules for AMD GPU Health Monitoring + +groups: + - name: gpu_health_alerts + interval: 30s + rules: + # GPU Temperature Alerts + - alert: GPUTemperatureWarning + expr: amdgpu_temperature_celsius{sensor="edge"} > 95 + for: 2m + labels: + severity: warning + annotations: + summary: "GPU temperature high on {{ $labels.node }}" + description: "GPU {{ $labels.gpu }} on node {{ $labels.node }} has temperature {{ $value }}°C (threshold: 95°C)" + + - alert: GPUTemperatureCritical + expr: amdgpu_temperature_celsius{sensor="edge"} > 105 + for: 1m + labels: + severity: critical + annotations: + summary: "GPU temperature critical on {{ $labels.node }}" + description: "GPU {{ $labels.gpu }} on node {{ $labels.node }} has temperature {{ $value }}°C (threshold: 105°C)" + + # GPU Power Alerts + - alert: GPUPowerHigh + expr: amdgpu_power_watts > 700 + for: 5m + labels: + severity: warning + annotations: + summary: "GPU power consumption high on {{ $labels.node }}" + description: "GPU {{ $labels.gpu }} on node {{ $labels.node }} consuming {{ $value }}W (threshold: 700W)" + + # ECC Error Alerts + - alert: GPUECCErrors + expr: rate(amdgpu_ecc_errors_total[5m]) > 0 + for: 1m + labels: + severity: warning + annotations: + summary: "GPU ECC errors detected on {{ $labels.node }}" + description: "GPU {{ $labels.gpu }} on node {{ $labels.node }} reporting ECC errors" + + # PCIe Replay Errors + - alert: PCIeReplayErrors + expr: rate(amdgpu_pcie_replay_count[5m]) > 10 + for: 2m + labels: + severity: warning + annotations: + summary: "PCIe replay errors on {{ $labels.node }}" + description: "GPU {{ $labels.gpu }} on node {{ $labels.node }} experiencing PCIe replay errors" + + # Exporter Health + - alert: DeviceMetricsExporterDown + expr: up{job="device-metrics-exporter"} == 0 + for: 2m + labels: + severity: critical + annotations: + summary: "Device Metrics Exporter down on {{ $labels.node }}" + description: "Cannot scrape metrics from {{ $labels.node }} - exporter may be down" + + # Cluster-wide alerts + - alert: MultipleGPUsOverheating + expr: count(amdgpu_temperature_celsius{sensor="edge"} > 95) > 3 + for: 5m + labels: + severity: critical + annotations: + summary: "Multiple GPUs overheating in cluster" + description: "{{ $value }} GPUs are above 95°C - possible cooling issue" diff --git 
a/monitoring/prometheus/prometheus.yml b/monitoring/prometheus/prometheus.yml new file mode 100644 index 00000000..abc87370 --- /dev/null +++ b/monitoring/prometheus/prometheus.yml @@ -0,0 +1,54 @@ +# Prometheus Configuration for CVS Device Metrics Monitoring +# This file configures Prometheus to scrape AMD GPU metrics from Device Metrics Exporter + +global: + scrape_interval: 15s # How often to scrape targets + evaluation_interval: 15s # How often to evaluate rules + scrape_timeout: 10s # Timeout for scraping + external_labels: + cluster: 'cvs-cluster' + monitor: 'gpu-monitoring' + +# Load alert rules +rule_files: + - 'alert_rules.yml' + +# Alertmanager configuration (optional) +# alerting: +# alertmanagers: +# - static_configs: +# - targets: +# - 'localhost:9093' + +# Scrape configurations +scrape_configs: + # Job for AMD Device Metrics Exporter running on all GPU nodes + - job_name: 'device-metrics-exporter' + static_configs: + - targets: + # ===== UPDATE THESE WITH YOUR ACTUAL NODE HOSTNAMES/IPs ===== + #- 'node1:5000' + #- 'node2:5000' + # Add more nodes as needed + # For local testing use: - 'localhost:5000' + - 'localhost:5000' + labels: + cluster: 'cvs-cluster' + + # Relabel to extract node name from target + relabel_configs: + - source_labels: [__address__] + regex: '([^:]+):.*' + target_label: node + replacement: '$1' + + # Metric relabeling (optional filtering) + metric_relabel_configs: + - source_labels: [__name__] + regex: 'gpu_.*' + action: keep + + # Prometheus self-monitoring + - job_name: 'prometheus' + static_configs: + - targets: ['localhost:9090'] diff --git a/monitoring/provisioning/dashboards/default.yml b/monitoring/provisioning/dashboards/default.yml new file mode 100644 index 00000000..0fac35e9 --- /dev/null +++ b/monitoring/provisioning/dashboards/default.yml @@ -0,0 +1,13 @@ +apiVersion: 1 + +providers: + - name: 'Default' + orgId: 1 + folder: '' + type: file + disableDeletion: false + updateIntervalSeconds: 10 + allowUiUpdates: true + options: + path: /var/lib/grafana/dashboards + foldersFromFilesStructure: false diff --git a/monitoring/provisioning/datasources/prometheus.yml b/monitoring/provisioning/datasources/prometheus.yml new file mode 100644 index 00000000..0534726e --- /dev/null +++ b/monitoring/provisioning/datasources/prometheus.yml @@ -0,0 +1,11 @@ +apiVersion: 1 + +datasources: + - name: prometheus + type: prometheus + access: proxy + url: http://localhost:9090 + isDefault: true + editable: false + jsonData: + timeInterval: "5s" diff --git a/tests/monitoring/cleanup_monitoring_stack.py b/tests/monitoring/cleanup_monitoring_stack.py new file mode 100644 index 00000000..0f2b13a5 --- /dev/null +++ b/tests/monitoring/cleanup_monitoring_stack.py @@ -0,0 +1,91 @@ +"""Cleanup test for GPU monitoring stack - removes all components from all nodes.""" + +import pytest +import logging +import subprocess +from lib.parallel_ssh_lib import Pssh + +logger = logging.getLogger(__name__) + +def is_localhost(ip_address): + """Check if IP address is localhost.""" + import socket + local_addresses = {'localhost', '127.0.0.1', '::1', '127.0.1.1'} + if ip_address in local_addresses: + return True + try: + result = subprocess.run(['hostname', '-I'], capture_output=True, text=True, timeout=5) + if result.returncode == 0: + local_addresses.update(result.stdout.strip().split()) + except: pass + return ip_address in local_addresses + +@pytest.mark.cleanup +def test_stop_exporters_on_all_nodes(cluster_dict, all_nodes): + """Stop and remove device-metrics-exporter containers from 
all nodes.""" + logger.info(f"Stopping device-metrics-exporters on all {len(all_nodes)} nodes") + username = cluster_dict['username'] + priv_key_file = cluster_dict.get('priv_key_file', f"/home/{username}/.ssh/id_rsa") + commands = ["docker stop device-metrics-exporter || true", "docker rm device-metrics-exporter || true"] + + for node_ip in all_nodes: + logger.info(f"Cleaning up exporter on node: {node_ip}") + if is_localhost(node_ip): + for cmd in commands: + subprocess.run(cmd, shell=True, capture_output=True, text=True) + else: + phdl = Pssh([node_ip], user=username, priv_key=priv_key_file) + for cmd in commands: + phdl.run(cmd) + logger.info("✓ Exporters cleaned up on all nodes") + +@pytest.mark.cleanup +def test_stop_prometheus_on_management(cluster_dict, management_node): + """Stop Prometheus systemd service.""" + logger.info(f"Stopping Prometheus on management node: {management_node}") + username = cluster_dict['username'] + commands = ["sudo systemctl stop prometheus || true", "sudo systemctl disable prometheus || true"] + + if is_localhost(management_node): + for cmd in commands: + subprocess.run(cmd, shell=True, capture_output=True, text=True) + else: + phdl = Pssh([management_node], user=username, priv_key=cluster_dict.get('priv_key_file')) + for cmd in commands: + phdl.run(cmd) + logger.info("✓ Prometheus stopped") + +@pytest.mark.cleanup +def test_stop_grafana_on_management(cluster_dict, management_node): + """Stop and remove Grafana container.""" + logger.info(f"Stopping Grafana on management node: {management_node}") + commands = ["docker stop grafana || true", "docker rm grafana || true"] + + if is_localhost(management_node): + for cmd in commands: + subprocess.run(cmd, shell=True, capture_output=True, text=True) + logger.info("✓ Grafana stopped") + +@pytest.mark.cleanup +def test_remove_prometheus_config(cluster_dict, management_node): + """Remove Prometheus configuration and data.""" + logger.info(f"Removing Prometheus config from management node") + commands = [ + "sudo rm -f /etc/systemd/system/prometheus.service", + "sudo systemctl daemon-reload", + "sudo rm -rf /etc/prometheus", + "sudo rm -rf /var/lib/prometheus" + ] + + if is_localhost(management_node): + for cmd in commands: + subprocess.run(cmd, shell=True, capture_output=True, text=True) + logger.info("✓ Prometheus config removed") + +@pytest.mark.cleanup +def test_cleanup_summary(all_nodes, management_node): + """Display cleanup summary.""" + logger.info("=" * 60) + logger.info("MONITORING STACK CLEANUP COMPLETE") + logger.info(f"Cleaned {len(all_nodes)} nodes") + logger.info("=" * 60) diff --git a/tests/monitoring/install_device_metrics_exporter.py b/tests/monitoring/install_device_metrics_exporter.py new file mode 100644 index 00000000..1da39acd --- /dev/null +++ b/tests/monitoring/install_device_metrics_exporter.py @@ -0,0 +1,938 @@ +# Drop-in replacement for tests/monitoring/install_device_metrics_exporter.py +# Key changes: +# 1. Added apply_monitoring_defaults to config_dict fixture +# 2. Updated metrics_host fixture to use resolved device_metrics_exporter_host +# 3. 
Fixed hardcoded localhost in test_check_gpu_metrics_exposed (line ~217) + +import pytest +import re +import sys +import os +import time +import json +import logging + +sys.path.insert(0, './lib') +from parallel_ssh_lib import * +from utils_lib import * + +import globals + +log = globals.log + + +@pytest.fixture(scope="module") +def cluster_file(pytestconfig): + """Get cluster file path from pytest CLI""" + return pytestconfig.getoption("cluster_file") + + +@pytest.fixture(scope="module") +def config_file(pytestconfig): + """Get config file path from pytest CLI""" + return pytestconfig.getoption("config_file") + + +@pytest.fixture(scope="module") +def cluster_dict(cluster_file): + """Load cluster configuration""" + with open(cluster_file) as json_file: + cluster_dict = json.load(json_file) + cluster_dict = resolve_cluster_config_placeholders(cluster_dict) + log.info(cluster_dict) + return cluster_dict + + +@pytest.fixture(scope="module") +def config_dict(config_file, cluster_dict): + """Load monitoring configuration with localhost/version fallbacks""" + with open(config_file) as json_file: + config_dict_t = json.load(json_file) + config_dict = config_dict_t.get('monitoring', {}) + config_dict = resolve_test_config_placeholders(config_dict, cluster_dict) + # Apply defaults for unresolved placeholders + config_dict = apply_monitoring_defaults(config_dict) + log.info("Resolved monitoring config:") + log.info(config_dict) + return config_dict + + +@pytest.fixture(scope="module") +def metrics_host(config_dict): + """Get metrics host with fallback to localhost""" + return config_dict.get("device_metrics_exporter_host", "localhost") + + +@pytest.fixture(scope="module") +def phdl(cluster_dict): + """Create parallel SSH handle for all nodes""" + node_list = list(cluster_dict['node_dict'].keys()) + phdl = Pssh(log, node_list, user=cluster_dict['username'], pkey=cluster_dict['priv_key_file']) + return phdl + + +def test_check_docker_installed(phdl): + """Verify Docker is installed on all nodes""" + globals.error_list = [] + log.info("Checking if Docker is installed on all nodes") + + out_dict = phdl.exec('docker --version') + + for node in out_dict.keys(): + if not re.search(r'Docker version', out_dict[node], re.I): + fail_test(f"Docker is not installed on node {node}. Please install Docker first.") + + update_test_result() + + +def test_check_rocm_installed(phdl): + """Verify ROCm is installed on all nodes""" + globals.error_list = [] + log.info("Checking if ROCm is installed on all nodes") + + out_dict = phdl.exec('rocm-smi --version || amd-smi version') + + for node in out_dict.keys(): + if not re.search(r'ROCm|AMD', out_dict[node], re.I): + fail_test(f"ROCm is not installed on node {node}. 
Please install ROCm first.") + + update_test_result() + + +def test_pull_device_metrics_exporter_image(phdl, config_dict): + """Pull Device Metrics Exporter Docker image on all nodes""" + globals.error_list = [] + log.info("Pulling Device Metrics Exporter Docker image on all nodes") + + version = config_dict['device_metrics_exporter_version'] + image = f"rocm/device-metrics-exporter:{version}" + log.info(f"Using image: {image}") + + out_dict = phdl.exec(f'docker pull {image}', timeout=300) + + for node in out_dict.keys(): + if 'Error' in out_dict[node] or 'failed' in out_dict[node].lower(): + fail_test(f"Failed to pull Docker image on node {node}: {out_dict[node]}") + + update_test_result() + + +def test_stop_existing_device_metrics_exporter(phdl): + """Stop and remove any existing Device Metrics Exporter containers""" + globals.error_list = [] + log.info("Stopping existing Device Metrics Exporter containers (if any)") + + phdl.exec('docker stop device-metrics-exporter 2>/dev/null || true') + phdl.exec('docker rm device-metrics-exporter 2>/dev/null || true') + + log.info("Cleaned up existing containers") + update_test_result() + + +def test_start_device_metrics_exporter(phdl, config_dict): + """Start Device Metrics Exporter container on all nodes""" + globals.error_list = [] + log.info("Starting Device Metrics Exporter on all nodes") + + version = config_dict['device_metrics_exporter_version'] + port = config_dict['device_metrics_exporter_port'] + + log.info(f"Starting exporter version {version} on port {port}") + + # Docker run command + docker_cmd = f'''docker run -d \ + --device=/dev/dri \ + --device=/dev/kfd \ + --network=host \ + -p {port}:{port} \ + --restart unless-stopped \ + --name device-metrics-exporter \ + rocm/device-metrics-exporter:{version}''' + + out_dict = phdl.exec(docker_cmd) + + for node in out_dict.keys(): + if 'Error' in out_dict[node]: + fail_test(f"Failed to start Device Metrics Exporter on node {node}: {out_dict[node]}") + + log.info("Device Metrics Exporter started on all nodes") + update_test_result() + + +def test_verify_exporter_running(phdl): + """Verify Device Metrics Exporter is running""" + globals.error_list = [] + log.info("Verifying Device Metrics Exporter is running on all nodes") + + # Wait for containers to start + time.sleep(10) + + out_dict = phdl.exec('docker ps --filter name=device-metrics-exporter --format "{{.Status}}"') + + for node in out_dict.keys(): + if 'Up' not in out_dict[node]: + fail_test(f"Device Metrics Exporter is not running on node {node}") + + update_test_result() + + +def test_verify_metrics_endpoint(phdl, config_dict, metrics_host): + """Verify metrics endpoint is accessible""" + globals.error_list = [] + log.info("Verifying metrics endpoint is accessible on all nodes") + + port = config_dict['device_metrics_exporter_port'] + log.info(f"Testing endpoint: http://{metrics_host}:{port}/metrics") + + # Retry logic for slow container startup + max_retries = 3 + out_dict = None + + for attempt in range(max_retries): + out_dict = phdl.exec(f'curl -s http://{metrics_host}:{port}/metrics | head -20') + + # Check if we got output + has_output = False + for node in out_dict.keys(): + if len(out_dict[node]) > 0: + has_output = True + break + + if has_output: + break + else: + log.info(f"Attempt {attempt+1}/{max_retries}: No output yet, waiting 5 seconds...") + time.sleep(5) + + # Final validation + for node in out_dict.keys(): + output = out_dict[node] + log.info(f"Checking output from {node}, length: {len(output)}") + + if output and 
'gpu_' in output.lower(): + log.info(f"Metrics endpoint verified on node {node}") + else: + log.error(f"Output sample: {output[:200]}") + fail_test(f"Metrics endpoint not accessible on node {node}") + + update_test_result() + + +def test_check_gpu_metrics_exposed(phdl, config_dict, metrics_host): + """Verify GPU metrics are being exposed""" + globals.error_list = [] + log.info("Checking if GPU metrics are being exposed") + + port = config_dict['device_metrics_exporter_port'] + + # Use metrics_host instead of hardcoded localhost + out_dict = phdl.exec(f'curl -s http://{metrics_host}:{port}/metrics | head -50') + + for node in out_dict.keys(): + output = out_dict[node] + log.info(f"Checking GPU metrics from {node}, length: {len(output)}") + + if output.strip() and 'gpu_' in output.lower(): + log.info(f"GPU metrics verified on node {node}") + # Show sample + lines = [line for line in output.split('\n') if 'gpu_' in line.lower()][:2] + for line in lines: + log.info(f" Sample: {line[:80]}") + else: + log.error(f"No GPU metrics found. Output: {output[:300]}") + fail_test(f"GPU metrics not found on node {node}") + + update_test_result() + + +def test_display_summary(phdl): + """Display installation summary""" + log.info("=" * 80) + log.info("Device Metrics Exporter Installation Complete!") + log.info("=" * 80) + log.info("") + log.info("Exporter Status:") + + out_dict = phdl.exec('docker ps --filter name=device-metrics-exporter --format "{{.Names}}: {{.Status}}"') + + for node in out_dict.keys(): + log.info(f" {node}: {out_dict[node]}") + + log.info("Completed metrics tests successfully.") + + +# ============================================================================ +# Node Role Detection Fixtures +# ============================================================================ + +@pytest.fixture(scope='module') +def management_node(cluster_dict): + """Get the management/head node from cluster.""" + from utils_lib import get_management_node + return get_management_node(cluster_dict) + + +@pytest.fixture(scope='module') +def all_nodes(cluster_dict): + """Get all nodes (management + workers) where exporter should run.""" + from utils_lib import get_all_nodes + return get_all_nodes(cluster_dict) + + +@pytest.fixture(scope='module') +def worker_nodes(cluster_dict): + """Get worker nodes only.""" + from utils_lib import get_worker_nodes + return get_worker_nodes(cluster_dict) + + +@pytest.fixture(scope='module') +def is_single_node(cluster_dict): + """Check if this is a single-node deployment.""" + from utils_lib import is_single_node_deployment + return is_single_node_deployment(cluster_dict) + + +@pytest.fixture(scope='module') +def prometheus_targets(cluster_dict, config_dict): + """Generate Prometheus scrape targets for all nodes.""" + from utils_lib import generate_prometheus_targets + exporter_port = config_dict.get('device_metrics_exporter_port', 5000) + return generate_prometheus_targets(cluster_dict, exporter_port) + + +def is_mgmt_node(node, cluster_dict): + """Helper function to check if node is management node.""" + from utils_lib import is_management_node + return is_management_node(node, cluster_dict) + + +# Tests with Management Node Awareness + +def test_deploy_prometheus_on_management_only(cluster_dict, management_node, is_single_node, config_dict, prometheus_targets): + """ + Deploy Prometheus ONLY on management node with all targets configured. + Uses pssh for multi-node, subprocess for localhost. 
+ """ + log.info("="*80) + log.info(f"Deploying Prometheus on management node: {management_node}") + log.info(f"Targets: {prometheus_targets}") + log.info("="*80) + + import subprocess + import os + from prometheus_config_lib import generate_prometheus_config + + # Generate Prometheus config + prometheus_yml = "/tmp/prometheus_cvs.yml" + generate_prometheus_config(cluster_dict, config_dict, prometheus_yml) + log.info(f" Config generated with {len(prometheus_targets)} targets") + + prom_version = config_dict.get('prometheus_version', 'v2.55.0').lstrip('v') + + # Deploy on localhost/management node + if is_single_node or is_localhost(management_node): + # LOCAL DEPLOYMENT + # Stop existing + subprocess.run("sudo systemctl stop prometheus 2>/dev/null || true", shell=True) + subprocess.run("sudo pkill -9 prometheus 2>/dev/null || true", shell=True) + + # Install if needed + if not os.path.exists('/opt/prometheus/prometheus'): + log.info(f"Installing Prometheus {prom_version}...") + cmd = f"""cd /tmp && wget -q https://github.com/prometheus/prometheus/releases/download/v{prom_version}/prometheus-{prom_version}.linux-amd64.tar.gz && tar xzf prometheus-{prom_version}.linux-amd64.tar.gz && sudo mkdir -p /opt/prometheus /var/lib/prometheus/data && sudo cp -r prometheus-{prom_version}.linux-amd64/* /opt/prometheus/""" + subprocess.run(cmd, shell=True, check=True) + + # Copy config + subprocess.run(f"sudo cp {prometheus_yml} /opt/prometheus/prometheus.yml", shell=True, check=True) + + # Create systemd service + svc = """[Unit] +Description=Prometheus +After=network.target + +[Service] +Type=simple +User=root +ExecStart=/opt/prometheus/prometheus --config.file=/opt/prometheus/prometheus.yml --storage.tsdb.path=/var/lib/prometheus/data --web.listen-address=0.0.0.0:9090 +Restart=always + +[Install] +WantedBy=multi-user.target +""" + with open('/tmp/prometheus.service', 'w') as f: + f.write(svc) + subprocess.run("sudo cp /tmp/prometheus.service /etc/systemd/system/", shell=True, check=True) + subprocess.run("sudo systemctl daemon-reload && sudo systemctl enable prometheus && sudo systemctl restart prometheus", shell=True, check=True) + + import time + time.sleep(3) + + # Verify + result = subprocess.run("systemctl is-active prometheus", shell=True, capture_output=True) + assert result.returncode == 0, "Prometheus not running" + log.info("SUCCESS: Prometheus running on management node (localhost)") + else: + # MULTI-NODE DEPLOYMENT via SSH to management node only + log.info(f"Deploying to remote management node: {management_node}") + from parallel_ssh_lib import Pssh + + # Create SSH client for management node ONLY + mgmt_dict = {management_node: cluster_dict['node_dict'].get(management_node, {'bmc_ip': 'NA', 'vpc_ip': management_node})} + phdl = Pssh(log, list(mgmt_dict.keys()), user=cluster_dict['username'], pkey=cluster_dict['priv_key_file']) + + # Upload config file to management node + import tempfile + with tempfile.NamedTemporaryFile(mode='w', suffix='.yml', delete=False) as f: + with open(prometheus_yml, 'r') as src: + f.write(src.read()) + temp_config = f.name + + # Deploy Prometheus on management node only + deploy_script = f""" + # Stop existing + sudo systemctl stop prometheus 2>/dev/null || true + sudo pkill -9 prometheus 2>/dev/null || true + + # Install if needed + if [ ! -f /opt/prometheus/prometheus ]; then + echo "Installing Prometheus {prom_version}..." 
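+ # NOTE: the download below assumes the management node has outbound HTTPS
+ # access to github.com; on air-gapped clusters, install Prometheus under
+ # /opt/prometheus beforehand so this branch is skipped.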
+ cd /tmp + wget -q https://github.com/prometheus/prometheus/releases/download/v{prom_version}/prometheus-{prom_version}.linux-amd64.tar.gz + tar xzf prometheus-{prom_version}.linux-amd64.tar.gz + sudo mkdir -p /opt/prometheus /var/lib/prometheus/data + sudo cp -r prometheus-{prom_version}.linux-amd64/* /opt/prometheus/ + fi + + # Copy config (uploaded separately via SCP) + sudo mkdir -p /opt/prometheus + + # Create systemd service + sudo tee /etc/systemd/system/prometheus.service > /dev/null << 'SVCEOF' +[Unit] +Description=Prometheus +After=network.target + +[Service] +Type=simple +User=root +ExecStart=/opt/prometheus/prometheus --config.file=/opt/prometheus/prometheus.yml --storage.tsdb.path=/var/lib/prometheus/data --web.listen-address=0.0.0.0:9090 +Restart=always + +[Install] +WantedBy=multi-user.target +SVCEOF + + sudo systemctl daemon-reload + sudo systemctl enable prometheus + sudo systemctl start prometheus + sleep 2 + systemctl is-active prometheus + """ + + # Execute deployment on management node only + result = phdl.exec(deploy_script) + + # Verify deployment succeeded + for node, output in result.items(): + if 'active' not in output: + fail_test(f"Prometheus deployment failed on {node}: {output}") + + log.info(f"SUCCESS: Prometheus deployed and running on management node: {management_node}") + log.info("SUCCESS: ENFORCEMENT: Prometheus deployed ONLY to management node, NOT to workers") + +def test_deploy_grafana_on_management_only(cluster_dict, management_node, is_single_node, config_dict): + """ + Deploy Grafana ONLY on management node. + Uses pssh for multi-node, subprocess for localhost. + """ + log.info(f"Deploying Grafana on management node: {management_node}") + + # Create provisioning configs and dashboard BEFORE starting Grafana + create_grafana_provisioning_configs() + create_grafana_dashboard_file() + + import subprocess + import os + + grafana_version = config_dict.get('grafana_version', '10.4.1') + grafana_port = config_dict.get('grafana_port', '3000') + + if is_single_node or is_localhost(management_node): + # LOCAL DEPLOYMENT + # Stop existing + subprocess.run("docker stop grafana 2>/dev/null || true", shell=True) + subprocess.run("docker rm grafana 2>/dev/null || true", shell=True) + + # Create data directory + grafana_data = "/home/svdt-8/manoj/cvs/cvs/monitoring/grafana_data" + os.makedirs(grafana_data, exist_ok=True) + subprocess.run(f"sudo chown -R 472:472 {grafana_data}", shell=True, check=True) + + # Start Grafana + cmd = f"""docker run -d \ + --name grafana \ + --network host \ + --restart unless-stopped \ + -v {grafana_data}:/var/lib/grafana \ + -v $(pwd)/monitoring/provisioning:/etc/grafana/provisioning \ + -v $(pwd)/monitoring/dashboards:/var/lib/grafana/dashboards \ + grafana/grafana:{grafana_version}""" + subprocess.run(cmd, shell=True, check=True) + + import time + time.sleep(3) + + # Verify + result = subprocess.run("docker ps | grep grafana", shell=True, capture_output=True) + assert result.returncode == 0, "Grafana not running" + log.info(f"SUCCESS: Grafana running on management node (localhost) port {grafana_port}") + else: + # MULTI-NODE DEPLOYMENT via SSH to management node only + log.info(f"Deploying to remote management node: {management_node}") + from parallel_ssh_lib import Pssh + + # Create SSH client for management node ONLY + mgmt_dict = {management_node: cluster_dict['node_dict'].get(management_node, {'bmc_ip': 'NA', 'vpc_ip': management_node})} + phdl = Pssh(log, list(mgmt_dict.keys()), user=cluster_dict['username'], 
pkey=cluster_dict['priv_key_file']) + + # Deploy Grafana on management node only + deploy_script = f""" + # Stop existing + docker stop grafana 2>/dev/null || true + docker rm grafana 2>/dev/null || true + + # Create data directory + mkdir -p /tmp/grafana_data + sudo chown -R 472:472 /tmp/grafana_data + + # Start Grafana + docker run -d \ + --name grafana \ + --network host \ + --restart unless-stopped \ + -v /tmp/grafana_data:/var/lib/grafana \ + grafana/grafana:{grafana_version} + + sleep 3 + docker ps | grep grafana + """ + + # Execute deployment on management node only + result = phdl.exec(deploy_script) + + # Verify deployment succeeded + for node, output in result.items(): + if 'grafana' not in output: + fail_test(f"Grafana deployment failed on {node}: {output}") + + log.info(f"SUCCESS: Grafana deployed and running on management node: {management_node}") + log.info("SUCCESS: ENFORCEMENT: Grafana deployed ONLY to management node, NOT to workers") + +def test_verify_all_nodes_for_exporter(all_nodes, management_node): + """ + Verify that exporter targets include all nodes (management + workers). + """ + log.info("="*80) + log.info(f"All nodes where exporter should run:") + for node in all_nodes: + is_mgmt = " (MANAGEMENT)" if node == management_node else "" + log.info(f" • {node}{is_mgmt}") + log.info("="*80) + + assert len(all_nodes) > 0 + assert management_node in all_nodes + log.info(f" Total nodes for exporter deployment: {len(all_nodes)}") + + +def test_prometheus_scrape_targets(prometheus_targets, all_nodes): + """ + Verify Prometheus scrape targets include all nodes. + """ + log.info("="*80) + log.info("Prometheus scrape targets:") + for target in prometheus_targets: + log.info(f" • {target}") + log.info("="*80) + + assert len(prometheus_targets) == len(all_nodes) + log.info(f" Scrape targets generated for all {len(all_nodes)} nodes") + + +def test_verify_service_distribution(cluster_dict, management_node, all_nodes, worker_nodes, is_single_node): + """ + CRITICAL TEST: Verify service distribution enforcement. + - Exporter must be on ALL nodes (management + workers) + - Prometheus must be ONLY on management node + - Grafana must be ONLY on management node + """ + log.info("="*80) + log.info("VERIFYING SERVICE DISTRIBUTION ENFORCEMENT") + log.info("="*80) + + # Show the architecture + log.info(f"\n Cluster Architecture:") + log.info(f" Management Node: {management_node}") + log.info(f" Worker Nodes: {worker_nodes if worker_nodes else 'None (single-node)'}") + log.info(f" Total Nodes: {len(all_nodes)}") + log.info(f" Deployment Type: {'Single-Node' if is_single_node else 'Multi-Node'}") + + log.info(f"\nSUCCESS: SERVICE DISTRIBUTION RULES:") + log.info(f" 1. Device Metrics Exporter → ALL {len(all_nodes)} nodes") + for node in all_nodes: + marker = "(MANAGEMENT)" if node == management_node else "(WORKER)" + log.info(f" {node} {marker}") + + log.info(f"\n 2. Prometheus → ONLY management node") + log.info(f" {management_node} (MANAGEMENT ONLY)") + if worker_nodes: + for node in worker_nodes: + log.info(f" {node} (NOT deployed)") + + log.info(f"\n 3. 
Grafana → ONLY management node") + log.info(f" {management_node} (MANAGEMENT ONLY)") + if worker_nodes: + for node in worker_nodes: + log.info(f" {node} (NOT deployed)") + + log.info(f"\n" + "="*80) + log.info("SUCCESS: SERVICE DISTRIBUTION VERIFIED") + log.info("="*80) + + # Assert the rules + assert len(all_nodes) >= 1, "Must have at least one node" + assert management_node in all_nodes, "Management node must be in all_nodes list" + + if not is_single_node: + assert len(worker_nodes) > 0, "Multi-node must have workers" + log.info(f"SUCCESS: ENFORCEMENT VERIFIED: Multi-node cluster with proper separation") + else: + log.info(f"SUCCESS: ENFORCEMENT VERIFIED: Single-node deployment (all services on localhost)") + + +def is_localhost(node): + """Check if a node IP/hostname refers to localhost.""" + import socket + import subprocess + + # Obvious localhost values + if node in ['localhost', '127.0.0.1', '::1', 'localhost.localdomain']: + return True + + # Get all local IPs + local_ips = set(['127.0.0.1', '::1', 'localhost']) + + try: + # Get hostname and its IP + hostname = socket.gethostname() + local_ips.add(hostname) + + # Get primary IP + try: + local_ip = socket.gethostbyname(hostname) + local_ips.add(local_ip) + except: + pass + + # Get all IPs from hostname -I + try: + result = subprocess.run(['hostname', '-I'], capture_output=True, text=True, timeout=2) + if result.returncode == 0: + for ip in result.stdout.strip().split(): + local_ips.add(ip.strip()) + except: + pass + + # Get all IPs from ip addr + try: + result = subprocess.run(['ip', 'addr'], capture_output=True, text=True, timeout=2) + if result.returncode == 0: + import re + for match in re.finditer(r'inet\s+(\d+\.\d+\.\d+\.\d+)', result.stdout): + local_ips.add(match.group(1)) + except: + pass + + except Exception as e: + log.warning(f"Error detecting local IPs: {e}") + + log.info(f"Local IPs detected: {local_ips}") + log.info(f"Checking if {node} is localhost: {node in local_ips}") + + return node in local_ips + + +def create_grafana_dashboard_file(): + """Create GPU dashboard with correct metric names.""" + import os + import json + + dashboard_dir = "monitoring/dashboards" + os.makedirs(dashboard_dir, exist_ok=True) + + dashboard = { + "annotations": {"list": []}, + "editable": True, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": None, + "links": [], + "panels": [ + { + "datasource": "prometheus", + "fieldConfig": { + "defaults": { + "color": {"mode": "palette-classic"}, + "custom": { + "axisCenteredZero": False, + "axisColorMode": "text", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "showPoints": "never" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + {"color": "green", "value": None}, + {"color": "yellow", "value": 70}, + {"color": "red", "value": 85} + ] + }, + "unit": "celsius" + } + }, + "gridPos": {"h": 8, "w": 8, "x": 0, "y": 0}, + "id": 1, + "options": { + "legend": {"displayMode": "list", "placement": "bottom", "showLegend": True}, + "tooltip": {"mode": "multi"} + }, + "targets": [ + { + "datasource": "prometheus", + "expr": "gpu_edge_temperature", + "legendFormat": "{{hostname}} GPU{{gpu_id}} Edge", + "refId": "A" + }, + { + "datasource": "prometheus", + "expr": "gpu_junction_temperature", + "legendFormat": "{{hostname}} GPU{{gpu_id}} Junction", + "refId": "B" + } + ], + "title": "GPU Temperature", + "type": "timeseries" + }, + { + 
"datasource": "prometheus", + "fieldConfig": { + "defaults": { + "color": {"mode": "palette-classic"}, + "custom": { + "axisCenteredZero": False, + "axisColorMode": "text", + "axisPlacement": "auto", + "drawStyle": "line", + "fillOpacity": 10, + "lineInterpolation": "linear", + "lineWidth": 1 + }, + "mappings": [], + "unit": "watt" + } + }, + "gridPos": {"h": 8, "w": 8, "x": 8, "y": 0}, + "id": 2, + "options": { + "legend": {"displayMode": "list", "placement": "bottom", "showLegend": True}, + "tooltip": {"mode": "multi"} + }, + "targets": [ + { + "datasource": "prometheus", + "expr": "gpu_power_usage", + "legendFormat": "{{hostname}} GPU{{gpu_id}}", + "refId": "A" + } + ], + "title": "GPU Power Usage", + "type": "timeseries" + }, + { + "datasource": "prometheus", + "fieldConfig": { + "defaults": { + "color": {"mode": "palette-classic"}, + "custom": { + "axisCenteredZero": False, + "axisColorMode": "text", + "axisPlacement": "auto", + "drawStyle": "line", + "fillOpacity": 10, + "lineInterpolation": "linear", + "lineWidth": 1 + }, + "mappings": [], + "unit": "watt" + } + }, + "gridPos": {"h": 8, "w": 8, "x": 16, "y": 0}, + "id": 3, + "options": { + "legend": {"displayMode": "list", "placement": "bottom", "showLegend": True}, + "tooltip": {"mode": "multi"} + }, + "targets": [ + { + "datasource": "prometheus", + "expr": "gpu_average_package_power", + "legendFormat": "{{hostname}} GPU{{gpu_id}}", + "refId": "A" + } + ], + "title": "GPU Average Package Power", + "type": "timeseries" + }, + { + "datasource": "prometheus", + "fieldConfig": { + "defaults": { + "color": {"mode": "palette-classic"}, + "custom": { + "axisCenteredZero": False, + "axisColorMode": "text", + "axisPlacement": "auto", + "drawStyle": "line", + "fillOpacity": 10, + "lineInterpolation": "linear", + "lineWidth": 1 + }, + "mappings": [], + "unit": "hertz" + } + }, + "gridPos": {"h": 8, "w": 12, "x": 0, "y": 8}, + "id": 4, + "options": { + "legend": {"displayMode": "list", "placement": "bottom", "showLegend": True}, + "tooltip": {"mode": "multi"} + }, + "targets": [ + { + "datasource": "prometheus", + "expr": "gpu_clock{clock_type=\"GPU_CLOCK_TYPE_SYSTEM\"}", + "legendFormat": "{{hostname}} GPU{{gpu_id}}", + "refId": "A" + } + ], + "title": "GPU Clock Speed", + "type": "timeseries" + }, + { + "datasource": "prometheus", + "fieldConfig": { + "defaults": { + "color": {"mode": "palette-classic"}, + "custom": { + "axisCenteredZero": False, + "axisColorMode": "text", + "axisPlacement": "auto", + "drawStyle": "line", + "fillOpacity": 10, + "lineInterpolation": "linear", + "lineWidth": 1 + }, + "mappings": [], + "unit": "celsius" + } + }, + "gridPos": {"h": 8, "w": 12, "x": 12, "y": 8}, + "id": 5, + "options": { + "legend": {"displayMode": "list", "placement": "bottom", "showLegend": True}, + "tooltip": {"mode": "multi"} + }, + "targets": [ + { + "datasource": "prometheus", + "expr": "gpu_memory_temperature", + "legendFormat": "{{hostname}} GPU{{gpu_id}}", + "refId": "A" + } + ], + "title": "GPU Memory Temperature", + "type": "timeseries" + } + ], + "refresh": "5s", + "schemaVersion": 39, + "tags": ["gpu", "amd", "rocm"], + "templating": {"list": []}, + "time": {"from": "now-15m", "to": "now"}, + "timepicker": {}, + "timezone": "browser", + "title": "AMD GPU Metrics Dashboard", + "uid": "amd-gpu-metrics", + "version": 1 + } + + dashboard_file = f"{dashboard_dir}/gpu-metrics-dashboard.json" + with open(dashboard_file, 'w') as f: + json.dump(dashboard, f, indent=2) + + log.info(f"✓ Created dashboard: {dashboard_file}") + return 
dashboard_file + + +def create_grafana_provisioning_configs(): + """Create Grafana provisioning configs for datasources and dashboards.""" + import os + + # Create directories + os.makedirs("monitoring/provisioning/datasources", exist_ok=True) + os.makedirs("monitoring/provisioning/dashboards", exist_ok=True) + + # Datasource config + datasource_config = """apiVersion: 1 + +datasources: + - name: prometheus + type: prometheus + access: proxy + url: http://localhost:9090 + isDefault: true + editable: false + jsonData: + timeInterval: "5s" +""" + + with open("monitoring/provisioning/datasources/prometheus.yml", 'w') as f: + f.write(datasource_config) + + # Dashboard provisioning config + dashboard_config = """apiVersion: 1 + +providers: + - name: 'Default' + orgId: 1 + folder: '' + type: file + disableDeletion: false + updateIntervalSeconds: 10 + allowUiUpdates: true + options: + path: /var/lib/grafana/dashboards +""" + + with open("monitoring/provisioning/dashboards/default.yml", 'w') as f: + f.write(dashboard_config) + + log.info("✓ Created Grafana provisioning configs") diff --git a/utils/deploy_monitoring_stack.sh b/utils/deploy_monitoring_stack.sh new file mode 100755 index 00000000..0ece3e94 --- /dev/null +++ b/utils/deploy_monitoring_stack.sh @@ -0,0 +1,238 @@ +#!/bin/bash + +# CVS Monitoring Stack Deployment Script +# Deploys Device Metrics Exporter + Prometheus + Grafana + +set -e + +# Get script directory and repo root +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)" + +cd "$REPO_ROOT" + +# Configuration with localhost fallback +CLUSTER_FILE="${1:-./input/cluster_file/local_test_cluster.json}" +MONITORING_CONFIG="${2:-./input/config_file/monitoring/monitoring_config.json}" + +# Helper function to resolve placeholders +resolve_with_fallback() { + local value="$1" + local fallback="$2" + + # If value is empty or contains unresolved placeholder pattern {...} + if [[ -z "$value" ]] || [[ "$value" =~ ^\{.*\}$ ]]; then + echo "$fallback" + else + # Remove 'v' prefix if exists for version numbers + echo "${value#v}" + fi +} + +# Read versions from config with fallback +if [ -f "$MONITORING_CONFIG" ] && command -v jq &> /dev/null; then + PROM_RAW=$(jq -r '.monitoring.prometheus_version // "v2.55.0"' "$MONITORING_CONFIG") + GRAF_RAW=$(jq -r '.monitoring.grafana_version // "10.4.1"' "$MONITORING_CONFIG") + EXPO_RAW=$(jq -r '.monitoring.device_metrics_exporter_version // "v1.4.0"' "$MONITORING_CONFIG") + + PROMETHEUS_VERSION=$(resolve_with_fallback "$PROM_RAW" "2.55.0") + GRAFANA_VERSION=$(resolve_with_fallback "$GRAF_RAW" "10.4.1") + DEVICE_METRICS_VERSION=$(resolve_with_fallback "$EXPO_RAW" "v1.4.0") +else + # Fallback defaults + PROMETHEUS_VERSION="2.55.0" + GRAFANA_VERSION="10.4.1" + DEVICE_METRICS_VERSION="v1.4.0" +fi + +echo "============================================" +echo "CVS Monitoring Stack Deployment" +echo "============================================" +echo "" +echo "Working Directory: $REPO_ROOT" +echo "Cluster File: $CLUSTER_FILE" +echo "Monitoring Config: $MONITORING_CONFIG" +echo "Prometheus Version: $PROMETHEUS_VERSION" +echo "Grafana Version: $GRAFANA_VERSION" +echo "Exporter Version: $DEVICE_METRICS_VERSION" +echo "" + +# Step 1: Deploy Device Metrics Exporter on all GPU nodes using pytest +echo "Step 1: Deploying Device Metrics Exporter on all GPU nodes..." 
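+# NOTE: the pytest invocation below writes its HTML report via the pytest-html
+# plugin (--html / --self-contained-html); the cluster and monitoring config
+# paths default to the values above and can be overridden as the script's
+# first and second positional arguments.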
+echo "------------------------------------------------------------" +pytest -vv -s ./tests/monitoring/install_device_metrics_exporter.py \ + --cluster_file "$CLUSTER_FILE" \ + --config_file "$MONITORING_CONFIG" \ + --html=/tmp/device_metrics_install_report.html \ + --capture=tee-sys \ + --self-contained-html + +if [ $? -ne 0 ]; then + echo "ERROR: Device Metrics Exporter installation failed!" + exit 1 +fi + +echo "" +echo "- Device Metrics Exporter deployed successfully!" +echo "" + +# Step 2: Setup Prometheus on management node +echo "Step 2: Setting up Prometheus..." +echo "------------------------------------------------------------" +# Stop existing Prometheus if running +if systemctl is-active --quiet prometheus 2>/dev/null; then + echo "Stopping existing Prometheus service..." + sudo systemctl stop prometheus + sleep 2 +fi + +sudo pkill -9 prometheus 2>/dev/null || true +sleep 2 + +if ! command -v prometheus &> /dev/null; then + echo "Prometheus not found. Installing..." + + cd /tmp + echo "Downloading Prometheus ${PROMETHEUS_VERSION} (~92MB)..." + wget --progress=bar:force https://github.com/prometheus/prometheus/releases/download/v${PROMETHEUS_VERSION}/prometheus-${PROMETHEUS_VERSION}.linux-amd64.tar.gz + echo "Download complete. Extracting..." + tar xzf prometheus-${PROMETHEUS_VERSION}.linux-amd64.tar.gz + + sudo mkdir -p /opt/prometheus + sudo cp -r prometheus-${PROMETHEUS_VERSION}.linux-amd64/* /opt/prometheus/ + sudo mkdir -p /var/lib/prometheus/data + + cd "$REPO_ROOT" + + # Copy config from repo + if [ -f "./monitoring/prometheus/prometheus.yml" ]; then + sudo cp ./monitoring/prometheus/prometheus.yml /opt/prometheus/ + echo "- Copied prometheus.yml" + else + echo "ERROR: prometheus.yml not found at ./monitoring/prometheus/prometheus.yml" + exit 1 + fi + + if [ -f "./monitoring/prometheus/alert_rules.yml" ]; then + sudo cp ./monitoring/prometheus/alert_rules.yml /opt/prometheus/ + echo "- Copied alert_rules.yml" + else + echo "WARNING: alert_rules.yml not found" + fi + + # Create systemd service + sudo tee /etc/systemd/system/prometheus.service > /dev/null </dev/null || true + docker rm grafana 2>/dev/null || true + + docker run -d \ + -p 3000:3000 \ + --name grafana \ + --restart unless-stopped \ + -v grafana-storage:/var/lib/grafana \ + grafana/grafana:${GRAFANA_VERSION} + + echo " Grafana installed and started" + echo " Default credentials: admin/admin" +else + echo " Grafana container already exists" + if ! docker ps --format '{{.Names}}' | grep -q '^grafana$'; then + echo " Starting Grafana..." + docker start grafana + fi +fi + +# Step 4: Verify everything is running +echo "" +echo "Step 4: Verifying installation..." 
+echo "------------------------------------------------------------" + +# Wait a bit for services to be ready +sleep 3 + +# Check Prometheus +if curl -s http://localhost:9090/-/healthy > /dev/null 2>&1; then + echo " Prometheus is healthy" +else + echo " Prometheus health check failed" +fi + +# Check Grafana +if curl -s http://localhost:3000/api/health > /dev/null 2>&1; then + echo " Grafana is healthy" +else + echo " Grafana health check failed (may still be starting...)" +fi + +# Check Device Metrics Exporter +if curl -s http://localhost:5000/metrics | head -1 > /dev/null 2>&1; then + echo " Device Metrics Exporter is responding" +else + echo " Device Metrics Exporter check failed" +fi + +# Check targets if jq available +if command -v jq &> /dev/null; then + echo "" + echo "Prometheus Targets:" + curl -s http://localhost:9090/api/v1/targets 2>/dev/null | \ + jq -r '.data.activeTargets[]? | "\(.labels.instance): \(.health)"' 2>/dev/null || \ + echo " (Could not retrieve targets)" +fi + +echo "" +echo "============================================" +echo "Deployment Complete!" +echo "============================================" +echo "" +echo "Access URLs:" +echo " Prometheus: http://localhost:9090" +echo " Grafana: http://localhost:3000" +echo " Exporter: http://localhost:5000/metrics" +echo "" +echo "Next Steps:" +echo " 1. Log into Grafana (admin/admin)" +echo " 2. Add Prometheus as datasource: http://localhost:9090" +echo " 3. Import dashboards from monitoring/grafana/dashboards/ (if available)" +echo " 4. Run CVS tests with --prometheus-url=http://localhost:9090" +echo ""
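For a quick sanity check once the stack is up, the commands below exercise each component directly. This is a minimal sketch that assumes the default ports from monitoring_config.json (exporter on 5000, Prometheus on 9090, Grafana on 3000), a localhost deployment, and that curl and jq are available; substitute the management node address on multi-node clusters. gpu_edge_temperature is one of the metrics the provisioned dashboard already graphs.

# Exporter: confirm GPU metrics are being published
curl -s http://localhost:5000/metrics | grep -m 5 '^gpu_'

# Prometheus: instant query for GPU edge temperature across all scraped nodes
curl -s 'http://localhost:9090/api/v1/query?query=gpu_edge_temperature' | \
  jq '.data.result[] | {node: .metric.node, gpu: .metric.gpu_id, value: .value[1]}'

# Prometheus: every device-metrics-exporter target should report "up"
curl -s http://localhost:9090/api/v1/targets | \
  jq -r '.data.activeTargets[] | "\(.labels.instance): \(.health)"'

# Grafana: basic health endpoint (default credentials admin/admin until changed)
curl -s http://localhost:3000/api/health | jq .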