# Graphing Prometheus Metrics

This notebook demonstrates how to query and graph long-term metrics from a Prometheus server. It assumes that both the optional long-term metrics component and the THREDDS log parser have been enabled.

Note: This notebook also requires packages that are not installed by default in the JupyterLab environment. You can install them by running the following command in a terminal:
```bash
pip install requests_magpie prometheus_pandas prometheus_api_client
```



In [1]:
import datetime as dt
import os
import re

import pandas as pd
import requests
import xarray as xr
from matplotlib import pyplot as plt
from prometheus_api_client.utils import parse_datetime
from prometheus_pandas import query
from requests_magpie import MagpieAuth

# The main production node doesn't yet have the long-term metrics
# node = "https://pavics.ouranos.ca"
node = "https://lvupavicsmaster.ouranos.ca"

# PAVICS credentials, read from the environment so that real values are never stored in the notebook.
# The fallback values are placeholders only.
USERNAME = os.getenv("PAVICS_USERNAME", "your_username")
PASSWORD = os.getenv("PAVICS_PASSWORD", "your_password")

# Login to Magpie.
# The session is intentionally NOT created inside a `with` block: it is handed to the Prometheus
# client below and reused by every later query, so it must not be closed at the end of this cell.
session = requests.Session()
session.auth = MagpieAuth(f"{node}/magpie", USERNAME, PASSWORD)

# Connect to Prometheus server
# TODO: Change this to the long-term metrics server when available @ /prometheus/federate ?
prom = query.Prometheus(api_url=f"{node}/prometheus", http=session)

In [2]:
# Long-term metrics attributes.
# Maps each Prometheus recording-rule name to the display name, units and description that are
# attached to the resulting DataArray (as `long_name`, `units` and `description` attributes).
meta = {
    # FIXME: Duplicate key
    # "instance:node_cpu_seconds:avg_rate1h_iowait": {
    #     "name": "CPU Load Fraction",
    #     "units": "",
    #     "description": "Fraction of the time, over the last hour, that CPUs were working, ie. not idle. 1 means all CPUs were working all the time, 0 means they were all idle all the time",
    # },
    "instance:node_cpu_seconds:avg_rate1h_iowait": {
        "name": "CPU IO Wait Fraction",
        "units": "",
        "description": "Fraction of the time, over the last hour, that CPUs were waiting for IO operations to complete.",
    },
    "instance:go_threads:avg1h": {
        "name": "Threads Count",
        "units": "",
        "description": "Number of threads running on the node, averaged over the last hour.",
    },
    "instance:node_network_transmit_bytes:sum_rate1h": {
        "name": "Outgoing Network Transfer",
        "units": "bytes",
        "description": "Total number of bytes transmitted over the network by the node, averaged over the last hour.",
    },
    "instance:node_network_receive_bytes:sum_rate1h": {
        "name": "Incoming Network Transfer",
        "units": "bytes",
        "description": "Total number of bytes received over the network by the node, averaged over the last hour.",
    },
    "instance:thredds_transfer_size_bytes:increase1h": {
        "name": "THREDDS Download Volume",
        "units": "bytes",
        "description": "Total size of data transferred from the THREDDS server, averaged over the last hour.",
    },
    "instance:node_memory_MemAvailable_bytes:avg1h": {
        "name": "Memory Available",
        "units": "bytes",
        "description": "Total memory available for use by applications, averaged over the last hour.",
    },
    # The underlying metric is SwapFree (free swap), so it is labelled as free space, not usage.
    "instance:node_memory_SwapFree_bytes:avg1h": {
        "name": "Free Swap Memory",
        "units": "bytes",
        "description": "Swap memory that is free (not in use), averaged over the last hour.",
    },
    "instance:node_disk_read_bytes:sum_rate1h": {
        "name": "Disk Read",
        "units": "bytes",
        "description": "Total number of bytes read from disk by the node, averaged over the last hour.",
    },
    "instance:node_disk_written_bytes:sum_rate1h": {
        "name": "Disk Write",
        "units": "bytes",
        "description": "Total number of bytes written to disk by the node, averaged over the last hour.",
    },
    "jupyter:container_fs_writes_bytes:sum_increase1h": {
        "name": "Jupyter Container Disk Write",
        "units": "bytes",
        "description": "Total number of bytes written to disk by each Jupyter container, averaged over the last hour.",
    },
    "jupyter:container_cpu_user_seconds:sum_increase1h": {
        "name": "Jupyter Container CPU User Time",
        "units": "seconds",
        "description": "Total user CPU time used by each Jupyter container, averaged over the last hour.",
    },
    # NOTE(review): the metric name suggests a boot *timestamp* rather than an elapsed duration;
    # confirm against the recording rule that "Uptime" is the right label.
    "instance:node_boot_time_seconds:max_over_time1d": {
        "name": "Uptime",
        "units": "seconds",
        "description": "Time since the node was last booted.",
    },
    "instance:node_filesystem_free_bytes:avg_min_over_time1d": {
        "name": "Free Filesystem Space",
        "units": "bytes",
        "description": "Total amount of free space on the filesystem, averaged over the last day.",
    },
    "instance:node_filesystem_size_bytes:avg_max_over_time1d": {
        "name": "Total Filesystem Space",
        "units": "bytes",
        "description": "Total amount of space on the filesystem, averaged over the last day.",
    },
    "instance:node_memory_MemTotal_bytes:avg_max_over_time1d": {
        "name": "Total Memory",
        "units": "bytes",
        "description": "Total amount of memory available on the node, averaged over the last day.",
    },
    "instance:node_memory_SwapTotal_bytes:avg_min_over_time1d": {
        "name": "Total Swap Memory",
        "units": "bytes",
        "description": "Total amount of swap memory available on the node, averaged over the last day.",
    },
    "jupyter:container_last_seen:sum_rate1d": {
        "name": "Jupyter Container Open",
        "units": "",
        "description": "Fraction of the time Jupyter containers were open during the last day.",
    },
}

In [3]:
def get(metric, start="1y", end="now") -> xr.DataArray:
    """
    Query a metric over a specified time range.

    Parameters
    ----------
    metric : str
        The Prometheus metric to graph. See `meta` for available long-term metrics.
    start : str, datetime
        The start time of the time range, can be a relative time from now (e.g. `5d`) or a specific date (YYYY-MM-DD).
    end : str, datetime
        The end time of the time range, can be `now` or a specific date (YYYY-MM-DD).

    Returns
    -------
    xr.DataArray
        Time series of the metric over the specified time range.

    Raises
    ------
    ValueError
        If `start` or `end` is a string that cannot be parsed into a date.
    """
    # We need to use `last_over_time` to get the last value of the metric over the specified time range,
    # otherwise prometheus won't return all the existing records. By default, a Prometheus time stamp is
    # valid during 5 minutes.
    # The query step is picked to match the window encoded in the metric name ("1h" or "1d").
    if "1h" in metric:
        step = dt.timedelta(hours=1)
        metric_ = f"last_over_time({metric}[1h])"
    elif "1d" in metric:
        step = dt.timedelta(days=1)
        metric_ = f"last_over_time({metric}[1d])"
    else:
        step = "5m"
        metric_ = metric

    # Strings are parsed into datetimes; anything else is passed through unchanged.
    bounds = {"start": start, "end": end}
    for label, value in bounds.items():
        if isinstance(value, str):
            parsed = parse_datetime(value)
            # The parser returns None instead of raising when it cannot understand the string;
            # fail here with a clear message rather than deep inside the query call.
            if parsed is None:
                raise ValueError(f"Could not parse {label} time: {value!r}")
            bounds[label] = parsed

    df = prom.query_range(metric_, start=bounds["start"], end=bounds["end"], step=step)
    return to_dataarray(df, metric)


def to_dataarray(df: pd.DataFrame, metric) -> xr.DataArray:
    """
    Convert a DataFrame from a Prometheus query to a DataArray.

    The function attaches the metadata attributes defined in `meta` (when the metric is listed there)
    and creates coordinates from the record labels.

    Parameters
    ----------
    df : pd.DataFrame
        Query result; each column name is expected to carry its labels as `key="value"` pairs.
    metric : str
        Name of the queried metric, used as the DataArray name and as the lookup key in `meta`.

    Returns
    -------
    xr.DataArray
        The combined series, with one coordinate per label and length-1 dimensions squeezed out.
    """
    ds = df.to_xarray().rename(index="time")

    # Metrics that are not described in `meta` still get usable attributes: the metric name is used
    # as `long_name`, and units/description are left empty.
    info = meta.get(metric, {})
    attrs = {
        "long_name": info.get("name", metric),
        "units": info.get("units", ""),
        "description": info.get("description", ""),
    }

    # Match record labels
    pat = re.compile(r'(\w+)="([^"]+)"')

    das = []
    for name, da in ds.data_vars.items():
        # Set the name and description of the data array
        da.name = metric
        da.attrs.update(attrs)

        # Convert labels into coordinates (each label becomes a length-1 dimension)
        coords = {key: [value] for key, value in pat.findall(name)}

        das.append(da.expand_dims(coords))

    return xr.combine_by_coords(das).squeeze()[metric]


def graph_metric_ts(da: xr.DataArray) -> plt.Figure:
    """
    Plot a one-dimensional metric time series on a single set of axes.

    Parameters
    ----------
    da : xr.DataArray
        The time series to draw. Its `long_name` attribute is used as the y-axis label,
        followed by the `units` attribute in parentheses when that attribute is non-empty.

    Returns
    -------
    plt.Figure
        The figure holding the plot.
    """
    figure, axes = plt.subplots(1, 1, figsize=(8, 6))
    da.plot(add_legend=False, ax=axes)

    axes.set_xlabel("Time")

    # Build the y-axis label, appending the units only when they are defined
    y_label = da.attrs["long_name"]
    units = da.attrs.get("units")
    if units:
        y_label = y_label + f" ({units})"
    axes.set_ylabel(y_label)

    return figure

In [4]:
# Query the CPU IO-wait metric with the default time range (start="1y", end="now") and plot it
fig = graph_metric_ts(get("instance:node_cpu_seconds:avg_rate1h_iowait"))

MagpieAuthenticationError: 

In [5]:
# Bar graph of the most downloaded datasets over a time range (by default, the last year)


def graph_most_downloaded(start="1y", end="now", n=5) -> plt.Figure:
    """
    Graph the most downloaded datasets.

    Parameters
    ----------
    start : str, datetime
        The start time of the time range, can be a relative time from now (e.g. `5d`) or a specific date (YYYY-MM-DD).
    end : str, datetime
        The end time of the time range, can be `now` or a specific date (YYYY-MM-DD).
    n : int
        The number of datasets to display in the graph.

    Returns
    -------
    plt.Figure
        Horizontal bar chart of the `n` largest totals, with the largest bar at the top.
    """

    da = get("instance:thredds_transfer_size_bytes:increase1h", start=start, end=end)

    # Sum the time series over time and over the request labels to get the total size of data
    # transferred per remaining label (presumably the dataset - confirm against the THREDDS labels).
    # `to_dataarray` squeezes out length-1 dimensions, so only sum over dimensions that are present.
    sum_dims = [dim for dim in ("remote_addr", "tds_service", "variable", "time") if dim in da.dims]
    totals = da.sum(sum_dims).to_series()

    # Keep the n largest totals (including the very largest), then reverse the order so that the
    # largest ends up at the top of the horizontal bar chart. Values are divided by 1024**2.
    tds = totals.sort_values(ascending=False).head(n).iloc[::-1] / 1024**2

    fig, ax = plt.subplots(1, 1, figsize=(8, 6))
    ax = tds.plot.barh(title=f"Top {n} downloaded datasets", ax=ax)
    ax.set_xlabel("Data transferred (MB)")

    return fig

In [6]:
# Draw the chart with the default arguments (start="1y", end="now", n=5)
fig = graph_most_downloaded()

MagpieAuthenticationError: 