In [6]:
# This class represent function to calculate energy consumption of running pipelines
#curl -s 'http://localhost:9090/api/v1/label/__name__/values' | jq '.data[] | select(. | contains("kepler"))'

#https://www.run.ai/guides/kubernetes-architecture/kubeflow-pipelines-the-basics-and-a-quick-tutorial.     KUBEFLOW_ARCH
#https://medium.com/@ketangangal98/introduction-d07e3aca35ac.         KUBEFLOW_ARCH
#http://localhost:9102/metrics
#http://localhost:9090/graph?g0.expr=%7Bnamespace%3D%22kubeflow%22%7D%20%3E%200&g0.tab=1&g0.display_mode=lines&g0.show_exemplars=0&g0.range_input=1h


In [7]:
import json
import yaml
import pandas as pd
from kfp import Client

def get_pipelines_cpu_memory_usage(pipeline_config_path: str):
    # Load the pipeline configuration from the specified YAML file
    with open(pipeline_config_path, 'r') as file:
        config = yaml.safe_load(file)

    # Extract necessary values from the configuration
    namespace = config['Deployment']['namespace']
    prometheus_url = config['Deployment']['prometheusURL']
    label_key = 'pipeline/runid'  # Specify the label key for querying metrics

    # Initialize the Kubeflow Pipelines client
    kfp_client = Client()
    pipelines = kfp_client.list_pipelines()
    print(f"pipeline size: {pipelines.total_size}")
    total_usage_data = []

    # Iterate over each pipeline
    for pipeline in pipelines.pipelines:
        print(f"Processing pipeline: {pipeline.display_name}")
        # filter=json.dumps({
        #                 "predicates": [{
        #                     "operation": "EQUALS",
        #                     "key": "runs.runs.pipeline_spec.pipeline_id",
        #                     "stringValue": pipeline.pipeline_id,
        #                 }]
        #             })
        # Get runs for the current pipeline
        runs = kfp_client.list_runs()


        # Iterate over each run of the pipeline
        for run in runs.runs:
            # Retrieve CPU and memory usage metrics
            total_cpu_usage_value = get_total_cpu_usage_metrics(label_key, run.run_id, namespace, prometheus_url)
            total_memory_usage_value = get_total_mem_usage_metrics(label_key, run.run_id, namespace, prometheus_url)
            total_joules = get_total_container_joules(label_key, run.run_id, namespace, prometheus_url)

            # Append the results to the usage data list
            total_usage_data.append({
                'pipeline_name': pipeline.display_name,
                'pipeline_id': pipeline.pipeline_id,
                'run_id': run.run_id,
                'total_memory_usage': total_memory_usage_value,
                'total_cpu_usage': total_cpu_usage_value,
                'total_joules' : total_joules
            })
    print(total_usage_data)
    # Return the collected data as a DataFrame
    return pd.DataFrame(total_usage_data)



In [8]:
import pandas as pd
import requests

def get_cpu_usage_metrics_per_pod(label_key: str, label_value: str, namespace: str, prometheus_url: str):
    # Construct the Prometheus query for CPU usage per pod with the specified label
    query = f'rate(kepler_container_core_joules_total{{namespace="{namespace}", "{label_key}"="{label_value}"}}[5m])'

    # Query Prometheus for the CPU consumption metrics
    response = requests.get(f"{prometheus_url}/api/v1/query", params={'query': query})

    # Check if the request was successful
    if response.status_code == 200:
        metrics_data = response.json()
        results = metrics_data['data']['result']
        if results:
            pod_cpu_usage = []
            for result in results:
                pod_name = result['metric'].get('pod')  # Extract pod name from the metric
                cpu_usage = float(result['value'][1])  # Get the CPU usage value for the pod
                pod_cpu_usage.append({'pod_name': pod_name, 'cpu_usage': cpu_usage})
            
            # Convert to a pandas DataFrame
            df = pd.DataFrame(pod_cpu_usage)
            return df
        else:
            # Return an empty DataFrame if no results
            return pd.DataFrame(columns=['pod_name', 'cpu_usage'])
    else:
        raise Exception(f"Error: {response.status_code} - {response.text}")


In [9]:
import requests
import pandas as pd
import yaml

def get_total_cpu_usage_metrics(label_key:str,label_value:str, namespace:str, prometheus_url:str):

   # Construct the Prometheus query for total CPU usage across all pods
    query = f'sum(rate(kepler_container_core_joules_total{{namespace="{namespace}", "{label_key}"="{label_value}"}}[5m]))'
    
    # Query Prometheus for the CPU consumption metrics
    response = requests.get(f"{prometheus_url}/api/v1/query", params={'query': query})

    # Check if the request was successful
    if response.status_code == 200:
        metrics_data = response.json()
        results = metrics_data['data']['result']
        if results:
            total_cpu_usage = float(results[0]['value'][1])  # Get the total CPU usage value
            return total_cpu_usage
        else:
            return 0.0 
    else:
        raise Exception(f"Error: {response.status_code} - {response.text}")



In [10]:
import pandas as pd
import requests

def get_mem_usage_metrics_per_pod(label_key: str, label_value: str, namespace: str, prometheus_url: str):
    # Construct the Prometheus query for memory usage per pod with the specified label
    query = f'rate(kepler_container_bpf_cpu_time_ms_total{{namespace="{namespace}", "{label_key}"="{label_value}"}}[5m])'

    # Query Prometheus for the memory consumption metrics
    response = requests.get(f"{prometheus_url}/api/v1/query", params={'query': query})

    # Check if the request was successful
    if response.status_code == 200:
        metrics_data = response.json()
        results = metrics_data['data']['result']
        if results:
            pod_mem_usage = []
            for result in results:
                pod_name = result['metric'].get('pod')  # Extract pod name from the metric
                mem_usage = float(result['value'][1])  # Get the memory usage value for the pod
                pod_mem_usage.append({'pod_name': pod_name, 'mem_usage': mem_usage})
            
            # Convert to a pandas DataFrame
            df = pd.DataFrame(pod_mem_usage)
            return df
        else:
            # Return an empty DataFrame if no results
            return pd.DataFrame(columns=['pod_name', 'mem_usage'])
    else:
        raise Exception(f"Error: {response.status_code} - {response.text}")


In [11]:
import requests
import pandas as pd
import yaml

def get_total_mem_usage_metrics(label_key:str,label_value:str, namespace:str, prometheus_url:str):

   # query = f'sum(rate(container_memory_usage_bytes{{namespace="{namespace}", {label_key}="{label_value}"}}[5m])) by (pod)'
    query = f'sum(rate(kepler_container_bpf_cpu_time_ms_total{{namespace="{namespace}", "{label_key}"="{label_value}"}}[5m]))'

    response = requests.get(f"{prometheus_url}/api/v1/query", params={'query': query})

    # Check if the request was successful
    if response.status_code == 200:
        metrics_data = response.json()
        results = metrics_data['data']['result']
        if results:
            total_memory_usage = float(results[0]['value'][1])  # Get the total memory usage value
            return total_memory_usage
        else:
            return 0.0 
    else:
        raise Exception(f"Error: {response.status_code} - {response.text}") 
        
    #     metrics = [{'pod': result['metric']['pod'], 'memory_usage': float(result['value'][1])} for result in results]
    #     return pd.DataFrame(metrics)
    # else:
    #     raise Exception(f"Error: {response.status_code} - {response.text}")
    

In [12]:
import pandas as pd
import requests

def get_container_joules_metrics_per_pod(label_key: str, label_value: str, namespace: str, prometheus_url: str):
    # Construct the Prometheus query for energy usage per pod with the specified label
    query = f'rate(kepler_container_joules_total{{namespace="{namespace}", "{label_key}"="{label_value}"}}[5m])'

    # Query Prometheus for the energy consumption metrics
    response = requests.get(f"{prometheus_url}/api/v1/query", params={'query': query})

    # Check if the request was successful
    if response.status_code == 200:
        metrics_data = response.json()
        results = metrics_data['data']['result']
        if results:
            pod_joules_usage = []
            for result in results:
                pod_name = result['metric'].get('pod')  # Extract pod name from the metric
                joules_usage = float(result['value'][1])  # Get the energy usage value (in joules) for the pod
                pod_joules_usage.append({'pod_name': pod_name, 'joules_usage': joules_usage})
            
            # Convert to a pandas DataFrame
            df = pd.DataFrame(pod_joules_usage)
            return df
        else:
            # Return an empty DataFrame if no results
            return pd.DataFrame(columns=['pod_name', 'joules_usage'])
    else:
        raise Exception(f"Error: {response.status_code} - {response.text}")


In [13]:
def get_total_container_joules(label_key:str,label_value:str, namespace:str, prometheus_url:str):

  # query = f'sum(rate(container_memory_usage_bytes{{namespace="{namespace}", {label_key}="{label_value}"}}[5m])) by (pod)'
    query = f'sum(rate(kepler_container_joules_total{{namespace="{namespace}", "{label_key}"="{label_value}"}}[5m]))'

    response = requests.get(f"{prometheus_url}/api/v1/query", params={'query': query})

    # Check if the request was successful
    if response.status_code == 200:
        metrics_data = response.json()
        results = metrics_data['data']['result']
        if results:
            total_memory_usage = float(results[0]['value'][1])  # Get the total memory usage value
            return total_memory_usage
        else:
            return 0.0 
    else:
        raise Exception(f"Error: {response.status_code} - {response.text}") 
        
    #     metrics = [{'pod': result['metric']['pod'], 'memory_usage': float(result['value'][1])} for result in results]
    #     return pd.DataFrame(metrics)
    # else:
    #     raise Exception(f"Error: {response.status_code} - {response.text}")
        

In [14]:
def get_pipelines_cpu_memory_usage_per_pod(pipeline_config_path: str):
    # Load the pipeline configuration from the specified YAML file
    with open(pipeline_config_path, 'r') as file:
        config = yaml.safe_load(file)

    # Extract necessary values from the configuration
    namespace = config['Deployment']['namespace']
    prometheus_url = config['Deployment']['prometheusURL']
    label_key = 'pipeline/runid'  # Specify the label key for querying metrics

    # Initialize the Kubeflow Pipelines client
    kfp_client = Client()
    pipelines = kfp_client.list_pipelines()
    print(f"Pipeline size: {pipelines.total_size}")
    total_usage_data = []

    # Iterate over each pipeline
    for pipeline in pipelines.pipelines:
        print(f"Processing pipeline: {pipeline.display_name}")
        # Get runs for the current pipeline
        runs = kfp_client.list_runs()

        # Iterate over each run of the pipeline
        for run in runs.runs:
            run_id = run.run_id

            # Retrieve CPU, memory, and energy usage metrics for all pods in the run
            cpu_metrics_per_pod = get_cpu_usage_metrics_per_pod(label_key, run_id, namespace, prometheus_url)
            mem_metrics_per_pod = get_mem_usage_metrics_per_pod(label_key, run_id, namespace, prometheus_url)
            joules_metrics_per_pod = get_container_joules_metrics_per_pod(label_key, run_id, namespace, prometheus_url)

            # Merge the per-pod metrics into a single DataFrame
            pod_metrics = cpu_metrics_per_pod.merge(mem_metrics_per_pod, on='pod_name', how='outer')
            pod_metrics = pod_metrics.merge(joules_metrics_per_pod, on='pod_name', how='outer')

            # Add metadata columns to the DataFrame
            pod_metrics['pipeline_name'] = pipeline.display_name
            pod_metrics['pipeline_id'] = pipeline.pipeline_id
            pod_metrics['run_id'] = run_id

            # Retrieve aggregated metrics for the entire run
            total_cpu_usage_value = get_total_cpu_usage_metrics(label_key, run_id, namespace, prometheus_url)
            total_memory_usage_value = get_total_mem_usage_metrics(label_key, run_id, namespace, prometheus_url)
            total_joules = get_total_container_joules(label_key, run_id, namespace, prometheus_url)

            # Add aggregated data as a new row (for summary purposes)
            total_usage_data.append({
                'pipeline_name': pipeline.display_name,
                'pipeline_id': pipeline.pipeline_id,
                'run_id': run_id,
                'total_memory_usage': total_memory_usage_value,
                'total_cpu_usage': total_cpu_usage_value,
                'total_joules': total_joules,
                'pod_metrics': pod_metrics  # Attach the per-pod DataFrame for further analysis
            })

    # Convert the aggregated usage data to a DataFrame
    total_usage_df = pd.DataFrame(total_usage_data)

    # Return the DataFrame containing both aggregated and per-pod data
    return total_usage_df


In [15]:
# draw pandas area plots
import pandas as pd
import matplotlib.pyplot as plt

# Function to plot an area chart for a metric
def plot_area_per_pod(output_df: pd.DataFrame, metric: str, title: str, xlabel: str, ylabel: str):
    """
    Plot an area chart for the given metric in the pipeline's per-pod usage data.

    :param output_df: DataFrame containing pipeline per-pod usage metrics.
    :param metric: The metric to visualize (e.g., 'cpu_usage', 'mem_usage', 'joules_usage').
    :param title: Title of the plot.
    :param xlabel: Label for the x-axis.
    :param ylabel: Label for the y-axis.
    """
    if metric not in output_df.columns:
        raise ValueError(f"Metric '{metric}' not found in the DataFrame.")
    
    # Aggregate data by pod
    pod_metric_data = output_df.groupby('pod_name')[metric].sum().reset_index()

    # Sort the data for a cleaner area plot
    pod_metric_data = pod_metric_data.sort_values(by=metric, ascending=False)

    # Plot the area chart
    plt.figure(figsize=(12, 6))
    plt.stackplot(pod_metric_data['pod_name'], pod_metric_data[metric], alpha=0.8, labels=[metric])
    plt.xticks(rotation=45, ha='right')
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.legend(loc='upper left')
    plt.tight_layout()
    plt.show()

In [16]:
# Main

import yaml
import pandas as pd
from kfp import Client

# Call the function with a sample path
output_df = get_pipelines_cpu_memory_usage_per_pod("../data/pipeline_config_sample.yaml")

#display(output_df)  # Jupyter-friendly display

# Access aggregated metrics
print(output_df[['pipeline_name', 'total_cpu_usage', 'total_memory_usage', 'total_joules']])

# Access per-pod metrics for a specific pipeline run
print(output_df.iloc[0]['pod_metrics'])  # Per-pod metrics for the first run

# Plot CPU usage per pod
plot_area_per_pod(output_df, metric='cpu_usage', 
                  title='CPU Usage Distribution Among Pods', 
                  xlabel='Pod Names', ylabel='CPU Usage (cores)')

# Plot Memory usage per pod
plot_area_per_pod(output_df, metric='mem_usage', 
                  title='Memory Usage Distribution Among Pods', 
                  xlabel='Pod Names', ylabel='Memory Usage (MB)')

# Plot Energy usage per pod
plot_area_per_pod(output_df, metric='joules_usage', 
                  title='Energy Consumption Distribution Among Pods', 
                  xlabel='Pod Names', ylabel='Energy Consumption (Joules)')



Pipeline size: 2
Processing pipeline: chain_employee_data_pipeline_618e4a19-530b-405b-9ce6-a1f7788b9778
Processing pipeline: chain_sales_data_pipeline_764bd130-0123-4a60-9f4e-31ff2b14b47f
                                       pipeline_name  total_cpu_usage  \
0  chain_employee_data_pipeline_618e4a19-530b-405...              0.0   
1  chain_employee_data_pipeline_618e4a19-530b-405...              0.0   
2  chain_sales_data_pipeline_764bd130-0123-4a60-9...              0.0   
3  chain_sales_data_pipeline_764bd130-0123-4a60-9...              0.0   

   total_memory_usage  total_joules  
0                 0.0           0.0  
1                 0.0           0.0  
2                 0.0           0.0  
3                 0.0           0.0  
Empty DataFrame
Columns: [pod_name, cpu_usage, mem_usage, joules_usage, pipeline_name, pipeline_id, run_id]
Index: []


ValueError: Metric 'cpu_usage' not found in the DataFrame.

In [None]:
## test

from prometheus_api_client import PrometheusConnect

# Connect to your Prometheus instance
prom = PrometheusConnect(url="http://localhost:9090", disable_ssl=True)

# Define your namespace
namespace = "kubeflow"

# Query to get the total energy consumption over the last 5 days
#query = f'sum(sum_over_time(kepler_container_bpf_block_irq_total{{namespace="{namespace}"}}[5d]))'
query = f'kepler_container_bpf_block_irq_total{{namespace="{namespace}"}}'

# Execute the query
result = prom.custom_query(query)

# Extract and print the CPU consumption for each pod
cpu_consumption = {item['metric']['pod']: float(item['value'][1]) for item in result if 'pod' in item['metric']}

print("CPU consumption (in joules) of all pods in namespace", namespace, ":", cpu_consumption)



In [45]:
import requests
import pandas as pd

# Define Prometheus query
prometheus_url = "http://localhost:9090/api/v1/query"
namespace = "kubeflow"  # Replace with your namespace
query = '{namespace="' + namespace + '", pod="chain-employee-data-pipeline-618e4a19-530b-405b-9ce6-a1f77gbfj9-system-container-impl-361845380"}'
#__name__="kepler.*", 
# Send the query to Prometheus
response = requests.get(prometheus_url, params={"query": query})

if response.status_code == 200:
    results = response.json()['data']['result']
    
    # Process and display results as a DataFrame
    if results:
        data = []
        for item in results:
            metric_name = item['metric'].get('__name__', 'unknown')
            pod_name = item['metric'].get('pod', 'N/A')  # Get pod name
            namespace_name = item['metric'].get('namespace', 'N/A')  # Get namespace name
            value = item['value'][1]
            data.append({
                "metric": metric_name,
                "namespace": namespace_name,
                "pod": pod_name,
                "value": value
            })
        
        # Create DataFrame
        df = pd.DataFrame(data)
        print("Metrics with values > 0 from namespace '{}'".format(namespace))
        display(df)
    else:
        print("No metrics found matching the criteria.")
else:
    print(f"Error querying Prometheus: {response.status_code}")


Metrics with values > 0 from namespace 'kubeflow'


Unnamed: 0,metric,namespace,pod,value
0,kube_pod_completion_time,kubeflow,chain-employee-data-pipeline-618e4a19-530b-405...,1733752305
1,kube_pod_container_info,kubeflow,chain-employee-data-pipeline-618e4a19-530b-405...,1
2,kube_pod_container_info,kubeflow,chain-employee-data-pipeline-618e4a19-530b-405...,1
3,kube_pod_container_state_started,kubeflow,chain-employee-data-pipeline-618e4a19-530b-405...,1733752294
4,kube_pod_container_state_started,kubeflow,chain-employee-data-pipeline-618e4a19-530b-405...,1733752294
...,...,...,...,...
62,kube_pod_tolerations,kubeflow,chain-employee-data-pipeline-618e4a19-530b-405...,1
63,kube_pod_tolerations,kubeflow,chain-employee-data-pipeline-618e4a19-530b-405...,1
64,kube_pod_service_account,kubeflow,chain-employee-data-pipeline-618e4a19-530b-405...,1
65,kube_pod_scheduler,kubeflow,chain-employee-data-pipeline-618e4a19-530b-405...,1


In [46]:
import requests
import pandas as pd

# Define Prometheus query
prometheus_url = "http://localhost:9090/api/v1/query"
namespace = "kubeflow"  # Replace with your namespace
query = '{namespace="' + namespace + '", pod="chain-employee-data-pipeline-618e4a19-530b-405b-9ce6-a1f77gbfj9-system-container-impl-361845380"}'

# Send the query to Prometheus
response = requests.get(prometheus_url, params={"query": query})

if response.status_code == 200:
    results = response.json()['data']['result']
    
    # Process and display results as a DataFrame
    if results:
        data = []
        for item in results:
            metric_name = item['metric'].get('__name__', 'unknown')
            pod_name = item['metric'].get('pod', 'N/A')  # Get pod name
            namespace_name = item['metric'].get('namespace', 'N/A')  # Get namespace name
            value = item['value'][1]
            data.append({
                "metric": metric_name,
                "namespace": namespace_name,
                "pod": pod_name,
                "value": value
            })
        
        # Create DataFrame
        df = pd.DataFrame(data)
        print("Metrics with values > 0 from namespace '{}'".format(namespace))
        print(df)

        # Save DataFrame to a CSV file
        output_file = "prometheus_metrics.csv"
        df.to_csv(output_file, index=False)
        print(f"Data successfully saved to '{output_file}'")
    else:
        print("No metrics found matching the criteria.")
else:
    print(f"Error querying Prometheus: {response.status_code}")


Metrics with values > 0 from namespace 'kubeflow'
                               metric namespace  \
0            kube_pod_completion_time  kubeflow   
1             kube_pod_container_info  kubeflow   
2             kube_pod_container_info  kubeflow   
3    kube_pod_container_state_started  kubeflow   
4    kube_pod_container_state_started  kubeflow   
..                                ...       ...   
62               kube_pod_tolerations  kubeflow   
63               kube_pod_tolerations  kubeflow   
64           kube_pod_service_account  kubeflow   
65                 kube_pod_scheduler  kubeflow   
66  node_namespace_pod:kube_pod_info:  kubeflow   

                                                  pod       value  
0   chain-employee-data-pipeline-618e4a19-530b-405...  1733752305  
1   chain-employee-data-pipeline-618e4a19-530b-405...           1  
2   chain-employee-data-pipeline-618e4a19-530b-405...           1  
3   chain-employee-data-pipeline-618e4a19-530b-405...  1733752294