In [None]:
import duckdb
import sys
import os
from pathlib import Path

The Jupyter server should be started from the notebook directory, so that the following cell prints the project root (the parent of the current working directory):

In [None]:
# The project root is taken to be the parent of the (resolved) working directory.
notebook_dir = Path.cwd().resolve()
project_root = str(notebook_dir.parent)
print(f"Project root: {project_root}")

In [None]:
# Prepend the project root to the import path (only once) so that the
# project's `src` package can be imported from this notebook.
if not any(entry == project_root for entry in sys.path):
    sys.path.insert(0, project_root)

In [None]:
%load_ext autoreload
# Reload all modules imported with %aimport every time before executing the Python code typed.
%autoreload 1
%aimport src.analysis.visualize_columns

In [None]:
import src.analysis.visualize_columns as visualize_columns

In [None]:
# VRAM in GB for each GPU model name, used to translate a GPU type into a memory size.
ram_map = {
    "a100": 80,
    "v100": 16,
    "a40": 48,
    "gh200": 95,
    "rtx_8000": 48,
    "2080_ti": 11,
    "1080_ti": 11,
    "2080": 8,
    "h100": 80,
    "l4": 23,
    "m40": 23,
    "l40s": 48,
    "titan_x": 12,
    "a16": 16,
}

# NOTE(review): presumably bin edges for bucketing VRAM values (the -1 .. 1e-6
# pair looks like a dedicated "zero" bucket) — confirm at the use site.
vram_cutoffs = [-1, 1e-6, 8, 11, 12, 16, 23, 32, 40, 48, 80]
# One label per interval between consecutive cutoffs: 0 for the first, then the upper edge.
vram_labels = [0, *vram_cutoffs[2:]]


def get_requested_vram(constraints):
    """Get the minimum requested VRAM from job constraints.

    Two constraint forms carry VRAM information:

    * ``vram<N>`` requests ``N`` GB directly (e.g. ``vram16``).
    * ``gpu:<type>`` requests a GPU model, whose VRAM is looked up in ``ram_map``.

    Every other constraint carries no VRAM information and is ignored. That
    includes malformed variants of the two forms above (``vram`` without a
    number, ``gpu`` without a model) and non-string entries.

    Args:
        constraints (list[str] | str | None): Constraint strings from the job.
            A bare string is treated as a single constraint. ``None``, NaN, or
            any other non-iterable value means the job has no constraints.

    Returns:
        int: Minimum requested VRAM in GB, or 0 if not specified.

    Raises:
        KeyError: If a ``gpu:<type>`` constraint names a model that is missing
            from ``ram_map``.
    """
    if isinstance(constraints, str):
        # A bare string would otherwise be walked character by character and
        # silently yield 0.
        constraints = [constraints]
    try:
        entries = list(constraints)
    except TypeError:
        # Non-iterable input (e.g. None or NaN for a missing value).
        return 0

    requested_vrams = []
    for constr in entries:
        if not isinstance(constr, str):
            continue
        # Constraint strings may arrive wrapped in single quotes.
        constr = constr.strip("'")
        if constr.startswith("vram"):
            try:
                requested_vrams.append(int(constr[len("vram"):]))
            except ValueError:
                # Starts with "vram" but has no numeric amount: not a VRAM request.
                continue
        elif constr.startswith("gpu"):
            parts = constr.split(":")
            if len(parts) < 2 or not parts[1]:
                # Starts with "gpu" but names no model: nothing to look up.
                continue
            gpu_type = parts[1]
            if gpu_type not in ram_map:
                raise KeyError(
                    f"Unknown GPU type {gpu_type!r} in constraint {constr!r}; add it to ram_map"
                )
            requested_vrams.append(ram_map[gpu_type])
    return min(requested_vrams, default=0)

class GPUMetrics:
    """Load GPU job records from a DuckDB database and derive per-job columns.

    Attributes:
        con: The DuckDB connection opened on ``metricsfile`` (left open).
        df: DataFrame of the selected jobs plus the derived columns
            ``requested_vram``, ``allocated_vram``, ``user_jobs`` and
            ``account_jobs``.
    """

    def __init__(self, metricsfile="./modules/admin-resources/reporting/slurm_data.db", min_elapsed=600) -> None:
        """Connect to the database and build the job table.

        Only jobs with at least one GPU, a non-null GPU type, an elapsed time
        above ``min_elapsed`` and a status other than CANCELLED or FAILED are
        selected.

        Args:
            metricsfile (str, optional): Path to the DuckDB database file containing job data.
            min_elapsed (int, optional): Minimum elapsed time (in seconds) for jobs to be included.
        """
        self.con = duckdb.connect(metricsfile)
        # TODO - handle array jobs properly
        job_query = (
            "select GPUs, GPUMemUsage, GPUComputeUsage, GPUType, Elapsed, "
            "StartTime,"
            "StartTime-SubmitTime as Queued, TimeLimit, Interactive, "
            "IsArray, JobID, ArrayID, Status, Constraints, Partition, User, Account from Jobs "
            # int() keeps the interpolated threshold a plain number.
            f"where GPUs > 0 and Elapsed>{int(min_elapsed)} and GPUType is not null "
            " and Status != 'CANCELLED' and Status != 'FAILED'"
        )
        jobs = self.con.query(job_query).to_df()

        def smallest_allocated_vram(gpu_types):
            # GPUType is iterated per job; the smallest VRAM among its entries is used.
            return min(ram_map[gpu_type] for gpu_type in gpu_types)

        jobs["requested_vram"] = jobs["Constraints"].apply(get_requested_vram)
        jobs["allocated_vram"] = jobs["GPUType"].apply(smallest_allocated_vram)
        # Attach to every row the number of selected jobs sharing its user / account.
        for owner_column, count_column in (("User", "user_jobs"), ("Account", "account_jobs")):
            jobs[count_column] = jobs.groupby(owner_column)[owner_column].transform("size")
        self.df = jobs

In [None]:
# Build the metrics from slurm_data_small.db, keeping only jobs that ran longer than 600 seconds.
metrics = GPUMetrics(metricsfile="../data/slurm_data_small.db", min_elapsed=600)

In [None]:
# Display the job table held by the metrics object.
metrics.df

In [None]:
# Hand the job table to the project's DataVisualizer.
visualizer = visualize_columns.DataVisualizer(metrics.df)


In [None]:
# NOTE(review): random_seed presumably makes any sampling done by the visualizer
# reproducible — confirm in src.analysis.visualize_columns.
visualizer.visualize_columns(random_seed=42)