# <a id='toc1_'></a>[Efficiency Analysis](#toc0_)
This notebook demonstrates how to use the `EfficiencyAnalysis` class in `src/analysis/efficiency_analysis.py` to analyze the efficiency of jobs, users, and PI groups.

**Table of contents**<a id='toc0_'></a>    
- [Efficiency Analysis](#toc1_)    
  - [Setup](#toc1_1_)    
  - [Example: Analyze workload efficiency of GPU users who set no VRAM constraints and used 0 GB of VRAM](#toc1_2_)    
    - [Job Efficiency Metrics](#toc1_2_1_)    
      - [Find most inefficient jobs with no VRAM constraints based on `vram_hours`](#toc1_2_1_1_)    
    - [User Efficiency Metrics](#toc1_2_2_)    
      - [Find Inefficient Users based on `expected_value_alloc_vram_efficiency`](#toc1_2_2_1_)    
      - [Find Inefficient Users based on `vram_hours`](#toc1_2_2_2_)    
    - [PI Group Efficiency Metrics](#toc1_2_3_)    
      - [Find Inefficient PIs based on `vram_hours`](#toc1_2_3_1_)    
  - [Example: Analyze all jobs with no VRAM constraints](#toc1_3_)    
    - [Job Efficiency Metrics](#toc1_3_1_)    
      - [Problem with duplicate JobIDs](#toc1_3_1_1_)    
      - [Top users with most number of jobs that have no VRAM constraints](#toc1_3_1_2_)    
      - [Find inefficient jobs with no VRAM Constraints based on `alloc_vram_efficiency_score`](#toc1_3_1_3_)    

<!-- vscode-jupyter-toc-config
	numbering=false
	anchor=true
	flat=false
	minLevel=1
	maxLevel=6
	/vscode-jupyter-toc-config -->
<!-- THIS CELL WILL BE REPLACED ON TOC UPDATE. DO NOT WRITE YOUR TEXT IN THIS CELL -->

## <a id='toc1_1_'></a>[Setup](#toc0_)

In [None]:
# Import required modules
import sys
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

The Jupyter server should be started from the notebook directory, so that the following cell prints the project root:

In [None]:
# Assumes the kernel was started in the notebook directory, one level below the
# repository root, so the parent of the resolved working directory is the root.
_notebook_dir = Path.cwd().resolve()
project_root = str(_notebook_dir.parent)
print(f"Project root: {project_root}")

In [None]:
# Add project root to sys.path for module imports
if project_root not in sys.path:
    sys.path.insert(0, project_root)

from src.analysis import vram_usage as ea

# Automatically reload modules before executing code
# This is useful for development to see changes without restarting the kernel.
%load_ext autoreload
# Reload all modules imported with %aimport every time before executing the Python code typed.
%autoreload 1
%aimport src.analysis.efficiency_analysis, src.preprocess.preprocess, src.config.enum_constants

In [None]:
# Read the "Jobs" table of the DuckDB file into a DataFrame via the project loader
preprocessed_jobs_df = ea.load_preprocessed_jobs_dataframe_from_duckdb(
    table_name="Jobs",
    db_path="../data/slurm_data.db",
)
# Preview the first 10 rows, then report the (rows, columns) shape
display(preprocessed_jobs_df.head(n=10))
print(preprocessed_jobs_df.shape)

## <a id='toc1_2_'></a>[Example: Analyze workload efficiency of GPU users who set no VRAM constraints and used 0 GB of VRAM](#toc0_)


In [None]:
# Analysis object for this example, built on the jobs DataFrame loaded above
efficiency_analysis = ea.EfficiencyAnalysis(jobs_df=preprocessed_jobs_df)

In [None]:
# Select jobs that set no VRAM constraint (pd.NA) and used 0 GB of VRAM
filtered_jobs = efficiency_analysis.filter_jobs_for_analysis(
    gpu_mem_usage_filter=0,
    vram_constraint_filter=pd.NA,
)
# Last expression of the cell: rendered as the cell output
filtered_jobs

Generate all metrics:

In [None]:
# Compute all metric tables for the filtered jobs in one call; the result is a
# dict that is indexed by the three keys below
metrics_dict = efficiency_analysis.calculate_all_efficiency_metrics(filtered_jobs)

# Pull the three tables out of the result dict, in job / user / PI-account order
jobs_with_metrics, users_with_metrics, pi_accounts_with_metrics = (
    metrics_dict[key]
    for key in (
        "jobs_with_efficiency_metrics",
        "users_with_efficiency_metrics",
        "pi_accounts_with_efficiency_metrics",
    )
)

### <a id='toc1_2_1_'></a>[Job Efficiency Metrics](#toc0_)

In [None]:
# Lift the column-display cap so every column of the preview is shown
pd.set_option("display.max_columns", None)
display(jobs_with_metrics.head(n=10))
# Put the option back to its pandas default
pd.reset_option("display.max_columns")

print(f"Jobs found: {len(jobs_with_metrics)}")

#### <a id='toc1_2_1_1_'></a>[Find most inefficient jobs with no VRAM constraints based on `vram_hours`](#toc0_)

In [None]:
# Jobs with at least 80 * 24 VRAM-hours, largest VRAM-hours first
inefficient_jobs_vram_hours = efficiency_analysis.sort_and_filter_records_with_metrics(
    metrics_df_name_enum=ea.MetricsDataFrameNameEnum.JOBS,
    sorting_key="vram_hours",
    ascending=False,
    filter_criteria={"vram_hours": {"min": 80 * 24, "inclusive": True}},
)
# Show the first 10 of those jobs
print("\nTop inefficient Jobs by VRAM-hours:")
display(inefficient_jobs_vram_hours.head(10))

# The chart covers (at most) the first 20 jobs
top_jobs = inefficient_jobs_vram_hours.head(20)

plt.figure(figsize=(10, 8))

# One y-axis label per job: "<JobID>\n<User>"
yticklabels = [
    f"{jid}\n{user}"
    for jid, user in zip(top_jobs["JobID"], top_jobs["User"], strict=True)
]

barplot = sns.barplot(x=top_jobs["vram_hours"], y=yticklabels, orient="h")
ax = barplot
ax.set_xlabel("VRAM-Hours")
ax.set_ylabel("Job ID / User")
ax.set_title("Top Inefficient Jobs by VRAM-Hours")

# Stretch the x-axis to 1.6x the longest bar so the labels fit to the right of the bars
xmax = top_jobs["vram_hours"].max()
xlim = xmax * 1.6 if xmax > 0 else 1
ax.set_xlim(0, xlim)

# Label every bar just past its end, but never beyond 98% of the axis width
for i, (vram_hours, job_hours) in enumerate(
    zip(top_jobs["vram_hours"], top_jobs["job_hours"], strict=True)
):
    ax.text(
        min(vram_hours + xlim * 0.02, xlim * 0.98),
        i,
        f"VRAM-Hours: {vram_hours:.2f}\nJob Hours: {job_hours:.2f}",
        va="center",
        ha="left",
        fontsize=10,
        color="black",
        clip_on=True,
    )

plt.tight_layout()
plt.show()


### <a id='toc1_2_2_'></a>[User Efficiency Metrics](#toc0_)

In [None]:
# Render the user-level metrics table (the 'users_with_efficiency_metrics' entry) as the cell output
users_with_metrics

#### <a id='toc1_2_2_1_'></a>[Find Inefficient Users based on `expected_value_alloc_vram_efficiency`](#toc0_)

In [None]:
# Users whose expected allocated-VRAM efficiency is at most 0.3 and who have at
# least 5 jobs, lowest efficiency first.
inefficient_users = efficiency_analysis.sort_and_filter_records_with_metrics(
    # The enum is accessed through the `ea` module, as in the other cells of this
    # notebook, not through the `efficiency_analysis` instance.
    metrics_df_name_enum=ea.MetricsDataFrameNameEnum.USERS,
    sorting_key="expected_value_alloc_vram_efficiency",
    ascending=True,  # we want to find users with low efficiency
    filter_criteria={
        "expected_value_alloc_vram_efficiency": {"max": 0.3, "inclusive": True},
        "job_count": {"min": 5, "inclusive": True},  # Minimum number of jobs to consider a user
    }
)

# Display the first 10 users of the sorted result
print("\nTop inefficient users by allocated vram efficiency:")
display(inefficient_users.head(10))


# Plot the job hours of those users, with the efficiency value as bar label
top_users = inefficient_users.head(10)

plt.figure(figsize=(8, 5))
barplot = sns.barplot(
    y=top_users["User"],
    x=top_users["user_job_hours"],
    orient="h"
)
plt.xlabel("Job Hours")
plt.ylabel("User")
plt.title("Top 10 Inefficient Users by Allocated VRAM Efficiency Contribution")

# Annotate bars with expected_value_alloc_vram_efficiency, keeping the text anchor inside the plot
ax = barplot
xmax = top_users["user_job_hours"].max()
# Add headroom for annotation space (20% extra); fall back to 1 when there is no positive maximum
xlim = xmax * 1.20 if xmax > 0 else 1
ax.set_xlim(0, xlim)

for i, (job_hours, efficiency) in enumerate(
    zip(
        top_users["user_job_hours"],
        top_users["expected_value_alloc_vram_efficiency"],
        strict=True,
    )
):
    # Start the label 2% of the axis width past the bar end, capped at 96% of the
    # axis width (the former 98% cap followed by a 96% cap reduces to this single cap).
    xpos = min(job_hours + xlim * 0.02, xlim * 0.96)
    ax.text(
        xpos,
        i,
        f"Eff: {efficiency:.2f}",
        va="center",
        ha="left",
        fontsize=10,
        color="black",
        clip_on=True
    )

plt.tight_layout()
plt.show()

#### <a id='toc1_2_2_2_'></a>[Find Inefficient Users based on `vram_hours`](#toc0_)

In [None]:
# Users selected by a VRAM-hours threshold of 200 and a minimum of 5 jobs
inefficient_users_vram_hours = efficiency_analysis.find_inefficient_users_by_vram_hours(
    vram_hours_filter={"min": 200, "inclusive": True},  # VRAM-hours threshold for identifying inefficient users
    min_jobs=5,  # Minimum number of jobs to consider a user
)
# Display the first 20 users of the result
print("\nTop inefficient users by VRAM-hours:")
display(inefficient_users_vram_hours.head(20))

top_users = inefficient_users_vram_hours.head(20)

# Plot those users by VRAM-hours, with VRAM-hours and job hours as labels
plt.figure(figsize=(8, 8))
barplot = sns.barplot(
    y=top_users["User"],
    x=top_users["vram_hours"],
    orient="h"
)
plt.xlabel("VRAM-Hours")
plt.ylabel("User")
# Up to 20 users are plotted, so the title reports the actual number shown
# instead of a hard-coded "Top 10".
plt.title(f"Top {len(top_users)} Inefficient Users by VRAM-Hours")
# Annotate bars with VRAM-hours and job hours, keeping the text anchor inside the plot
ax = barplot
xmax = top_users["vram_hours"].max()
# Add headroom for annotation space (60% extra); fall back to 1 when there is no positive maximum
xlim = xmax * 1.6 if xmax > 0 else 1
ax.set_xlim(0, xlim)
for i, (vram_hours, user_job_hours) in enumerate(
    zip(
        top_users["vram_hours"],
        top_users["user_job_hours"],
        strict=True,
    )
):
    # Place annotation at min(vram_hours + 2% of xlim, 98% of xlim)
    xpos = min(vram_hours + xlim * 0.02, xlim * 0.98)
    ax.text(
        xpos,
        i,
        f"VRAM-Hours: {vram_hours:.2f}\n Job Hours: {user_job_hours:.2f}",
        va="center",
        ha="left",
        fontsize=10,
        color="black",
        clip_on=True
    )
plt.tight_layout()
plt.show()

### <a id='toc1_2_3_'></a>[PI Group Efficiency Metrics](#toc0_)

In [None]:
# Render the PI-account metrics table (the 'pi_accounts_with_efficiency_metrics' entry) as the cell output
pi_accounts_with_metrics

#### <a id='toc1_2_3_1_'></a>[Find Inefficient PIs based on `vram_hours`](#toc0_)

In [None]:
# PI accounts with at least 200 VRAM-hours and at least 5 jobs, largest VRAM-hours first
inefficient_pis_vram_hours = efficiency_analysis.sort_and_filter_records_with_metrics(
    metrics_df_name_enum=ea.MetricsDataFrameNameEnum.PI_GROUPS,
    sorting_key="pi_acc_vram_hours",
    ascending=False,
    filter_criteria={
        "pi_acc_vram_hours": {"min": 200, "inclusive": True},
        "job_count": {"min": 5, "inclusive": True},
    },
)
# Show the first 20 PI accounts of the sorted result
print("\nTop inefficient PI Groups by VRAM-hours:")
display(inefficient_pis_vram_hours.head(20))

top_pi_accounts = inefficient_pis_vram_hours.head(20)

# Horizontal bar chart of VRAM-hours per PI account
plt.figure(figsize=(8, 8))
barplot = sns.barplot(
    x=top_pi_accounts["pi_acc_vram_hours"],
    y=top_pi_accounts["pi_account"],
    # Explicit category order taken from the rows being plotted (only show present values)
    order=top_pi_accounts["pi_account"].tolist(),
    orient="h",
)
ax = barplot
ax.set_xlabel("VRAM-Hours")
ax.set_ylabel("PI Account")
ax.set_title("Top Inefficient PI Accounts by VRAM-Hours")

# Stretch the x-axis to 1.6x the longest bar so the labels fit to the right of the bars
xmax = top_pi_accounts["pi_acc_vram_hours"].max()
xlim = xmax * 1.6 if xmax > 0 else 1
ax.set_xlim(0, xlim)

# Label every bar with its VRAM-hours and job hours, starting just past the bar
# end but never beyond 98% of the axis width
for i, (vram_hours, pi_acc_job_hours) in enumerate(
    zip(
        top_pi_accounts["pi_acc_vram_hours"],
        top_pi_accounts["pi_acc_job_hours"],
        strict=True,
    )
):
    xpos = min(vram_hours + xlim * 0.02, xlim * 0.98)
    ax.text(
        xpos,
        i,
        f"VRAM-Hours: {vram_hours:.2f}\n Job Hours: {pi_acc_job_hours:.2f}",
        va="center",
        ha="left",
        fontsize=10,
        color="black",
        clip_on=True,
    )

plt.tight_layout()
plt.show()

## <a id='toc1_3_'></a>[Example: Analyze all jobs with no VRAM constraints](#toc0_)

In [None]:
# Filter jobs where no VRAM constraint was set but a GPU was allocated
no_vram_constraint_efficiency_analysis = ea.EfficiencyAnalysis(
    jobs_df=preprocessed_jobs_df
)
all_no_vram_constraint_jobs = no_vram_constraint_efficiency_analysis.filter_jobs_for_analysis(
    # pd.NA is the value this notebook uses to select jobs without a VRAM constraint
    # (see the first example). A {"min": 0, "inclusive": False} range would presumably
    # select jobs that DO have a constraint, contradicting this section's purpose.
    vram_constraint_filter=pd.NA,  # No VRAM constraints
    gpu_count_filter={"min": 1, "inclusive": True},  # At least one GPU allocated
    gpu_mem_usage_filter={"min": 0, "inclusive": False}  # Used more than 0 GiB of VRAM
)

# Preview the first 10 rows, then report the (rows, columns) shape
display(all_no_vram_constraint_jobs.head(10))
print(all_no_vram_constraint_jobs.shape)

### <a id='toc1_3_1_'></a>[Job Efficiency Metrics](#toc0_)

In [None]:
# Compute the job-level efficiency metrics for the filtered jobs
no_vram_constraint_jobs_with_metrics = (
    no_vram_constraint_efficiency_analysis.calculate_job_efficiency_metrics(
        all_no_vram_constraint_jobs
    )
)

# Lift the column-display cap so every column of the preview is shown
pd.set_option("display.max_columns", None)
display(no_vram_constraint_jobs_with_metrics.head(n=10))
# Put the option back to its pandas default
pd.reset_option("display.max_columns")
print(f"Jobs found: {len(no_vram_constraint_jobs_with_metrics)}")

#### <a id='toc1_3_1_1_'></a>[Problem with duplicate JobIDs](#toc0_)

In [None]:
# Show every row recorded under JobID 24374463, with all columns visible
pd.set_option("display.max_columns", None)
display(
    no_vram_constraint_jobs_with_metrics.loc[
        no_vram_constraint_jobs_with_metrics["JobID"] == 24374463
    ]
)
# Put the option back to its pandas default
pd.reset_option("display.max_columns")

#### <a id='toc1_3_1_2_'></a>[Top users with most number of jobs that have no VRAM constraints](#toc0_)

In [None]:
# Bar chart of the 20 users with the most jobs in the filtered set;
# an empty selection only prints a notice.
if all_no_vram_constraint_jobs.empty:
    print("No jobs found without VRAM constraints.")
else:
    # Jobs per user, most frequent first, limited to 20 users
    user_counts = all_no_vram_constraint_jobs["User"].value_counts().head(20)
    plt.figure(figsize=(10, 5))
    sns.barplot(y=user_counts.index, x=user_counts.values, orient="h")
    plt.title("Top 20 Users: Jobs with no VRAM Constraints")
    plt.xlabel("Number of Jobs")
    plt.ylabel("User")
    plt.tight_layout()
    plt.show()

#### <a id='toc1_3_1_3_'></a>[Find inefficient jobs with no VRAM Constraints based on `alloc_vram_efficiency_score`](#toc0_)

In [None]:
# Jobs whose alloc_vram_efficiency_score is at most -10, lowest score first
low_alloc_vram_score_jobs = no_vram_constraint_efficiency_analysis.sort_and_filter_records_with_metrics(
    metrics_df_name_enum=ea.MetricsDataFrameNameEnum.JOBS,
    sorting_key="alloc_vram_efficiency_score",
    ascending=True,  # Sort by alloc_vram_efficiency_score in ascending order
    filter_criteria={
        "alloc_vram_efficiency_score": {"max": -10, "inclusive": True},  # score threshold
    }
)
# Display the first 20 jobs of the sorted result
print("\nTop inefficient Jobs by allocated VRAM efficiency score:")

top_jobs = low_alloc_vram_score_jobs.head(20)
display(top_jobs)

if top_jobs.empty:
    # With no rows the minimum score is NaN and the tick computation below
    # (int() of a NaN) would raise, so skip the chart.
    print("No jobs found below the allocated VRAM efficiency score threshold.")
else:
    # Plot the selected jobs by alloc_vram_efficiency_score
    plt.figure(figsize=(10, 12))

    # One y-axis label per job; the DataFrame index is included in the label
    # next to the JobID and the user name
    yticklabels = [
        f"idx: {idx}\nID: {job_id}\n{user}"
        for idx, job_id, user in zip(
            top_jobs.index,
            top_jobs["JobID"],
            top_jobs["User"],
            strict=True
        )
    ]

    xmin = top_jobs["alloc_vram_efficiency_score"].min()
    print(f"Minimum Allocated VRAM Efficiency Score: {xmin}")

    # Bar length = |lowest score| - |score|: the lowest score gets a zero-length
    # bar and scores closer to 0 get longer bars.
    x = abs(xmin) - top_jobs["alloc_vram_efficiency_score"].abs()

    # Build a DataFrame for plotting
    plot_df = pd.DataFrame({
        "allocated_vram_efficiency_score_column_height": x.to_numpy(),
        "job_hours": top_jobs["job_hours"],
        "job_index_and_username": yticklabels
    }, index=top_jobs.index)

    barplot = sns.barplot(
        data=plot_df,
        y="job_index_and_username",
        x="allocated_vram_efficiency_score_column_height",
        orient="h",
    )

    plt.xlabel("Allocated VRAM Efficiency Score")
    plt.ylabel("Job Index / User")
    plt.title("Top Inefficient Jobs by Allocated VRAM Efficiency Score")

    ax = barplot
    # Axis limit of THIS chart (20% headroom). It is defined here so the tick
    # count below does not depend on a stale `xlim` left over from an earlier cell.
    xlim = abs(xmin) * 1.2 if xmin < 0 else 1
    ax.set_xlim(0, xlim)
    # Set x-ticks to actual alloc_vram_efficiency_score values:
    # between 4 and 12 ticks, evenly spaced from the lowest score to 0
    num_xticks = max(4, min(12, int(abs(xmin) // (xlim * 0.10)) + 1))
    xticks = np.linspace(xmin, 0, num=num_xticks)
    # Tick positions use the same |xmin| - |score| transform as the bars,
    # while the tick labels show the untransformed scores
    ax.set_xticks([abs(xmin) - abs(val) for val in xticks])
    ax.set_xticklabels([f"{val:.0f}" for val in xticks], rotation=45)

    for i, (column_height, alloc_vram_efficiency_score, job_hours) in enumerate(
        zip(
            plot_df["allocated_vram_efficiency_score_column_height"],
            top_jobs["alloc_vram_efficiency_score"],
            plot_df["job_hours"],
            strict=True)
    ):
        # Start the annotation 2% of |xmin| to the right of the bar end
        xpos = column_height + abs(xmin) * 0.02
        ax.text(
            xpos,
            i,
            f"Score: {alloc_vram_efficiency_score:.2f}\nJob Hours: {job_hours:.2f}",
            va="center",
            ha="left",
            fontsize=10,
            color="black",
            clip_on=True
        )

    plt.tight_layout()
    plt.show()
