# <a id='toc1_'></a>[No VRAM Use Analysis](#toc0_)
This notebook analyzes jobs that ran on GPU partitions but did not use any VRAM. It looks at these jobs, the corresponding users, and their PI groups.

**Table of contents**<a id='toc0_'></a>    
- [No VRAM Use Analysis](#toc1_)    
  - [Load the data](#toc1_1_)    
  - [Setup](#toc1_2_)    
    - [Generate all metrics used for analysis:](#toc1_2_1_)    
    - [Job Metrics](#toc1_2_2_)    
      - [Find most inefficient jobs with no VRAM constraints based on `vram_hours`](#toc1_2_2_1_)    
    - [User Metrics](#toc1_2_3_)    
      - [Find Inefficient Users based on `vram_hours`](#toc1_2_3_1_)    
    - [PI Group Metrics](#toc1_2_4_)    
      - [Find Inefficient PIs based on `vram_hours`](#toc1_2_4_1_)    

<!-- vscode-jupyter-toc-config
	numbering=false
	anchor=true
	flat=false
	minLevel=1
	maxLevel=6
	/vscode-jupyter-toc-config -->
<!-- THIS CELL WILL BE REPLACED ON TOC UPDATE. DO NOT WRITE YOUR TEXT IN THIS CELL -->

## <a id='toc1_1_'></a>[Load the data](#toc0_)

In [None]:
# Import required modules
import sys
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

The Jupyter server should be started from the notebook's directory, so that the following cell prints the project root:

In [None]:
# The project root is taken to be two directory levels above the current working directory.
notebook_dir = Path.cwd().resolve()
project_root = str(notebook_dir.parent.parent)
print(f"Project root: {project_root}")

In [None]:
# Enable IPython's autoreload extension (mode 2) BEFORE the project imports below,
# so modules are automatically reloaded before code is executed.
%load_ext autoreload
%autoreload 2

# Put the project root at the front of sys.path so the `src.*` imports below resolve.
# The membership check keeps re-runs of this cell from inserting duplicates.
if project_root not in sys.path:
    sys.path.insert(0, project_root)

# Project-local modules: the analysis module (aliased `ea`), the two visualizers used
# for the job and user plots, and the enum whose members (`.JOBS`, `.PI_GROUPS`) are
# used later to select a metrics DataFrame.
from src.analysis import efficiency_analysis as ea
from src.visualization import JobsWithMetricsVisualizer, UsersWithMetricsVisualizer
from src.config.enum_constants import MetricsDataFrameNameEnum

In [None]:
# Read the "Jobs" table from the project's DuckDB file into a DataFrame,
# then preview the first rows and report the frame's shape.
slurm_db_path = Path(project_root, "data/slurm_data.db")
preprocessed_jobs_df = ea.load_preprocessed_jobs_dataframe_from_duckdb(db_path=slurm_db_path, table_name="Jobs")

display(preprocessed_jobs_df.head(10))
print(preprocessed_jobs_df.shape)

## <a id='toc1_2_'></a>[Setup](#toc0_)


In [None]:
# Build the analysis object that every later cell queries.
efficiency_analysis = ea.EfficiencyAnalysis(
    jobs_df=preprocessed_jobs_df,
    metrics_df_name_enum=MetricsDataFrameNameEnum,
)

In [None]:
# Restrict the analysis to jobs that used 0 GB of VRAM.
filtered_jobs = efficiency_analysis.filter_jobs_for_analysis(gpu_mem_usage_filter=0)
filtered_jobs

### <a id='toc1_2_1_'></a>[Generate all metrics used for analysis:](#toc0_)

In [None]:
# Compute every metrics table in one call, then pull the three tables out by key
# (job-level, user-level and PI-account-level, in that order).
metrics_dict = efficiency_analysis.calculate_all_efficiency_metrics(filtered_jobs)

jobs_with_metrics, users_with_metrics, pi_accounts_with_metrics = (
    metrics_dict[metrics_key]
    for metrics_key in (
        "jobs_with_efficiency_metrics",
        "users_with_efficiency_metrics",
        "pi_accounts_with_efficiency_metrics",
    )
)

### <a id='toc1_2_2_'></a>[Job Metrics](#toc0_)

In [None]:
# Preview the job-level metrics with every column visible.
# `option_context` lifts the column limit for this one display only and then restores
# whatever `display.max_columns` was set to before, even if `display` raises.
# The previous set_option/reset_option pair instead reset the option to the pandas
# default, discarding any custom value set earlier in the session.
with pd.option_context("display.max_columns", None):
    display(jobs_with_metrics.head(10))

print(f"Jobs found: {len(jobs_with_metrics)}")

#### <a id='toc1_2_2_1_'></a>[Find most inefficient jobs with no VRAM constraints based on `vram_hours`](#toc0_)

In [None]:
# Rank jobs by VRAM-hours (largest first), keeping those that meet the threshold.
inefficient_jobs_vram_hours = efficiency_analysis.sort_and_filter_records_with_metrics(
    metrics_df_name_enum=MetricsDataFrameNameEnum.JOBS,
    sorting_key="vram_hours",
    ascending=False,
    # VRAM-hours threshold for identifying inefficient jobs
    # NOTE(review): 80 * 24 presumably means 80 GB held for 24 hours — confirm.
    filter_criteria={"vram_hours": {"min": 80 * 24, "inclusive": True}},
)

# Show the ten jobs with the most VRAM-hours.
print("\nTop inefficient Jobs by VRAM-hours:")
display(inefficient_jobs_vram_hours.head(10))

# Plot the same ten jobs, labelling each bar with its VRAM-hours and allocated VRAM.
top_jobs_by_vram_hours = inefficient_jobs_vram_hours.head(10)
jobs_with_metrics_visualizer = JobsWithMetricsVisualizer(top_jobs_by_vram_hours)
jobs_with_metrics_visualizer.visualize(
    column="vram_hours",
    bar_label_columns=["vram_hours", "allocated_vram"],
    figsize=(10, 6),
)

### <a id='toc1_2_3_'></a>[User Metrics](#toc0_)

In [None]:
# Inspect the per-user metrics table taken from `metrics_dict` earlier in the notebook.
users_with_metrics

#### <a id='toc1_2_3_1_'></a>[Find Inefficient Users based on `vram_hours`](#toc0_)

In [None]:
# Select users by VRAM-hours threshold and by a minimum number of jobs.
inefficient_users_vram_hours = efficiency_analysis.find_inefficient_users_by_vram_hours(
    # VRAM-hours threshold for identifying inefficient users
    vram_hours_filter={"min": 200, "inclusive": True},
    # Minimum number of jobs to consider a user
    min_jobs=5,
)

# Show the first ten users of the result.
print("\nTop inefficient users by VRAM-hours:")
display(inefficient_users_vram_hours.head(10))

# Plot the same ten users, labelling each bar with VRAM-hours and the user's job hours.
top_users_by_vram_hours = inefficient_users_vram_hours.head(10)
users_with_metrics_visualizer = UsersWithMetricsVisualizer(top_users_by_vram_hours)
users_with_metrics_visualizer.visualize(
    column="vram_hours",
    bar_label_columns=["vram_hours", "user_job_hours"],
    figsize=(10, 6),
)

In [None]:
# Distribution of VRAM-hours over the users selected in the previous cell.
x_focus = inefficient_users_vram_hours["vram_hours"]

# VRAM-hours span several orders of magnitude, so the histogram is binned in log space.
# `log_scale=True` makes seaborn compute the `n_bins` bins on the log-transformed data
# and switch the x-axis to a log scale. The previous version drew 10**4 linear bins on a
# log axis, which does not give log-spaced bins, and left `n_bins` unused.
n_bins = 50

fig, ax = plt.subplots()
sns.histplot(x_focus, bins=n_bins, log_scale=True, color="#1f77b4", ax=ax)
ax.set_xlabel("VRAM-Hours (log scale)")
ax.set_ylabel("Count")
ax.set_title("VRAM-Hours Distribution")

# Annotate the plot with the summed VRAM-hours of all plotted users.
total_vram_hours = x_focus.sum()
ax.text(
    0.98,
    0.95,
    f"Total VRAM_Hours\n{total_vram_hours:,.2f}",
    transform=ax.transAxes,  # position in axes-fraction coordinates (top-right corner)
    ha="right",
    va="top",
    fontsize=10,
    bbox=dict(boxstyle="round,pad=0.3", fc="white", ec="gray", alpha=0.9),
)

plt.tight_layout()
plt.show()

### <a id='toc1_2_4_'></a>[PI Group Metrics](#toc0_)

In [None]:
# Inspect the per-PI-account metrics table taken from `metrics_dict` earlier in the notebook.
pi_accounts_with_metrics

#### <a id='toc1_2_4_1_'></a>[Find Inefficient PIs based on `vram_hours`](#toc0_)

In [None]:
from matplotlib.transforms import blended_transform_factory

# Rank PI accounts by VRAM-hours (largest first), applying a VRAM-hours threshold and a
# minimum job count.
inefficient_pis_vram_hours = efficiency_analysis.sort_and_filter_records_with_metrics(
    metrics_df_name_enum=MetricsDataFrameNameEnum.PI_GROUPS,
    sorting_key="pi_acc_vram_hours",
    ascending=False,
    filter_criteria={
        "pi_acc_vram_hours": {"min": 200, "inclusive": True},  # VRAM-hours threshold for a PI account
        "job_count": {"min": 5, "inclusive": True},  # Minimum number of jobs to consider a PI account
    },
)

# Show the ten PI accounts with the most VRAM-hours.
print("\nTop inefficient PI Groups by VRAM-Hours:")
display(inefficient_pis_vram_hours.head(10))

top_pi_accounts = inefficient_pis_vram_hours.head(10)

pi_accounts = top_pi_accounts["pi_account"].tolist()
# Fall back to "-" placeholders when the frame has no `user_count` column.
# (`DataFrame.get(col, default)` returns the default unchanged for a missing column, so
# chaining `.tolist()` onto it raised AttributeError on the plain-list fallback.)
if "user_count" in top_pi_accounts.columns:
    user_counts = top_pi_accounts["user_count"].tolist()
else:
    user_counts = ["-"] * len(top_pi_accounts)

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(
    y=pi_accounts,
    x=top_pi_accounts["pi_acc_vram_hours"],
    order=pi_accounts,
    orient="h",
    palette="Blues_r",
    hue=pi_accounts,
    ax=ax,
)

# Replace the default tick labels with custom two-line labels placed OUTSIDE the left spine.
ax.set_yticks(range(len(pi_accounts)))
ax.set_yticklabels([])  # clear built-in labels

transform = blended_transform_factory(ax.transAxes, ax.transData)  # x in axes fraction, y in data coords
x_outside = -0.02  # negative x fraction places text just left of spine; adjust if needed
line_gap = 0.4  # vertical separation between the two lines
for y_pos, (pi, uc) in enumerate(zip(pi_accounts, user_counts, strict=True)):
    # First line (PI account) slightly above center
    ax.text(
        x_outside,
        y_pos - line_gap / 2,
        pi,
        ha="right",
        va="center",
        transform=transform,
        fontsize=10,
        clip_on=False,
    )
    # Second line (number of users) slightly below center
    ax.text(
        x_outside,
        y_pos + line_gap / 2,
        f"# of Users: {uc}",
        ha="right",
        va="center",
        transform=transform,
        fontsize=9,
        color="dimgray",
        clip_on=False,
    )

# Y-axis label: place it further left than the custom tick labels (x < x_outside, in axes fraction).
ax.set_ylabel("PI Account / Users", rotation=90, labelpad=20)
ax.yaxis.set_label_coords(x_outside - 0.30, 0.5)

# Tick labels are already blank; keep small outward tick marks.
ax.tick_params(axis="y", which="both", direction="out", length=4, pad=2)

ax.set_xlabel("VRAM-Hours")
ax.set_title("Top Inefficient PI Accounts by VRAM-Hours")

# Annotate each bar with VRAM-hours and job hours to its right. The x-limit is widened
# to 1.6x the longest bar to leave room for the text.
xmax = top_pi_accounts["pi_acc_vram_hours"].max()
xlim = xmax * 1.6 if xmax > 0 else 1
ax.set_xlim(0, xlim)
for i, (vram_hours, pi_acc_job_hours) in enumerate(
    zip(top_pi_accounts["pi_acc_vram_hours"], top_pi_accounts["pi_acc_job_hours"], strict=True)
):
    # Start the label just past the bar end, capped at 92% of the x-range.
    xpos = min(vram_hours + xlim * 0.02, xlim * 0.92)
    ax.text(
        xpos,
        i,
        f"VRAM-Hours: {vram_hours:.2f}\nJob Hours: {pi_acc_job_hours:.2f}",
        va="center",
        ha="left",
        fontsize=9,
        color="black",
        clip_on=True,
    )

plt.tight_layout()
plt.show()