# <a id='toc1_'></a>[Efficiency Analysis](#toc0_)


This notebook demonstrates the use of the `EfficiencyAnalysis` class in `src/analysis/efficiency_analysis.py` for analyzing the efficiency of jobs, users, and PI groups.

**Table of contents**<a id='toc0_'></a>    
- [Efficiency Analysis](#toc1_)    
  - [Setup](#toc1_1_)    
  - [Example: Analyze Workload Efficiency Across Jobs that Use some VRAM](#toc1_2_)    
    - [Job Efficiency Metrics](#toc1_2_1_)    
      - [Find most inefficient jobs with no VRAM constraints based on `alloc_vram_efficiency`](#toc1_2_1_1_)    
    - [User Efficiency Metrics](#toc1_2_2_)    
      - [Find Inefficient Users based on `expected_value_alloc_vram_efficiency`](#toc1_2_2_1_)    
      - [Distribution of `expected_value_requested_vram_efficiency`](#toc1_2_2_2_)    
      - [Find Users with Highest `vram_hours`](#toc1_2_2_3_)    
    - [PI Group Efficiency Metrics](#toc1_2_3_)    
      - [Find PIs with Highest `vram_hours`](#toc1_2_3_1_)    
  - [Example: Analyze all jobs with no VRAM constraints](#toc1_3_)    
    - [Job Efficiency Metrics](#toc1_3_1_)    
    - [Top users with most number of jobs that have no VRAM constraints](#toc1_3_2_)    
    - [Find inefficient jobs with no VRAM Constraints based on `alloc_vram_efficiency_score`](#toc1_3_3_)    

<!-- vscode-jupyter-toc-config
	numbering=false
	anchor=true
	flat=false
	minLevel=1
	maxLevel=6
	/vscode-jupyter-toc-config -->
<!-- THIS CELL WILL BE REPLACED ON TOC UPDATE. DO NOT WRITE YOUR TEXT IN THIS CELL -->

## <a id='toc1_1_'></a>[Setup](#toc0_)

In [None]:
# Import required modules
import sys
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

The Jupyter server should be started from the notebook's directory, so that the following cell prints the project root:

In [None]:
# The project root is taken to be two directory levels above the current working directory
notebook_dir = Path.cwd().resolve()
project_root = str(notebook_dir.parent.parent)
print(f"Project root: {project_root}")

In [None]:
# Automatically reload modules before executing code (set this up BEFORE imports)
%load_ext autoreload
%autoreload 2

# Put the project root at the front of sys.path so the `src.*` imports below resolve.
# The membership check keeps re-runs of this cell from inserting duplicate entries.
if project_root not in sys.path:
    sys.path.insert(0, project_root)

from src.analysis import efficiency_analysis as ea
from src.visualization import (
    JobsWithMetricsVisualizer,
    UsersWithMetricsVisualizer,
    PIGroupsWithMetricsVisualizer,
)
from src.config.enum_constants import MetricsDataFrameNameEnum
from src.config.paths import (
    DATA_DIR,
    JOBS_VISUALIZATION_DATA_DIR,
    USERS_VISUALIZATION_DATA_DIR,
    PI_GROUPS_VISUALIZATION_DATA_DIR,
)

In [None]:
# Read the "Jobs" table from the DuckDB file under DATA_DIR into a DataFrame
jobs_db_path = Path(DATA_DIR) / "slurm_data.db"
preprocessed_jobs_df = ea.load_preprocessed_jobs_dataframe_from_duckdb(db_path=jobs_db_path, table_name="Jobs")

# Preview the first 10 rows and report the (rows, columns) shape
display(preprocessed_jobs_df.head(10))
print(preprocessed_jobs_df.shape)

## <a id='toc1_2_'></a>[Example: Analyze Workload Efficiency Across Jobs that Use some VRAM](#toc0_)


In [None]:
# Build the analysis object over the preprocessed jobs table. The MetricsDataFrameNameEnum
# class itself (not one of its members) is passed as metrics_df_name_enum; its members
# (JOBS, USERS, PI_GROUPS) are used in later cells to pick which metrics table to sort and filter.
efficiency_analysis = ea.EfficiencyAnalysis(
    jobs_df=preprocessed_jobs_df, metrics_df_name_enum=MetricsDataFrameNameEnum
)

In [None]:
# Select the jobs to analyze: GPU memory usage strictly above 0 (min=0, exclusive) and
# elapsed_seconds_min=600 (presumably a 10-minute minimum runtime; confirm against
# filter_jobs_for_analysis).
filtered_jobs = efficiency_analysis.filter_jobs_for_analysis(
    gpu_mem_usage_filter={"min": 0, "inclusive": False}, elapsed_seconds_min=600
)
# Bare expression: renders the filtered jobs as the cell output
filtered_jobs

Generate all metrics:

In [None]:
# Compute all efficiency metrics for the filtered jobs in a single call
metrics_dict = efficiency_analysis.calculate_all_efficiency_metrics(filtered_jobs)

# Pull the job-, user-, and PI-account-level tables out of the returned mapping
jobs_with_metrics, users_with_metrics, pi_accounts_with_metrics = (
    metrics_dict[table_name]
    for table_name in (
        "jobs_with_efficiency_metrics",
        "users_with_efficiency_metrics",
        "pi_accounts_with_efficiency_metrics",
    )
)

### <a id='toc1_2_1_'></a>[Job Efficiency Metrics](#toc0_)

In [None]:
# Show every column for this one display call. option_context restores the previous
# "display.max_columns" value on exit (even if display raises), instead of resetting
# the option to the pandas default and discarding any value the user had set.
with pd.option_context("display.max_columns", None):
    display(jobs_with_metrics.head(10))

print(f"Jobs found: {len(jobs_with_metrics)}")

#### <a id='toc1_2_1_1_'></a>[Find most inefficient jobs with no VRAM constraints based on `alloc_vram_efficiency`](#toc0_)

In [None]:
inefficient_jobs_alloc_vram_eff = efficiency_analysis.sort_and_filter_records_with_metrics(
    metrics_df_name_enum=MetricsDataFrameNameEnum.JOBS,
    sorting_key="alloc_vram_efficiency",
    ascending=True,  # Lowest alloc_vram_efficiency first, i.e. least efficient jobs on top
    filter_criteria={
        # Empty: no filter criteria are passed for this ranking
    },
)
# Display the top inefficient jobs by allocated VRAM efficiency
print("\nTop inefficient Jobs by Allocated VRAM Efficiency:")
display(inefficient_jobs_alloc_vram_eff.head(10))

# Plot the same 10 jobs by allocated VRAM efficiency, passing job_hours and vram_hours
# as the bar label columns
jobs_with_metrics_visualizer = JobsWithMetricsVisualizer(inefficient_jobs_alloc_vram_eff.head(10))
jobs_with_metrics_visualizer.visualize(
    output_dir_path=JOBS_VISUALIZATION_DATA_DIR,
    column="alloc_vram_efficiency",
    bar_label_columns=["job_hours", "vram_hours"],
    figsize=(10, 6),
)

### <a id='toc1_2_2_'></a>[User Efficiency Metrics](#toc0_)

In [None]:
# Bare expression: renders the per-user metrics table as the cell output
users_with_metrics

#### <a id='toc1_2_2_1_'></a>[Find Inefficient Users based on `expected_value_alloc_vram_efficiency`](#toc0_)

In [None]:
inefficient_users_alloc_vram_eff = efficiency_analysis.sort_and_filter_records_with_metrics(
    metrics_df_name_enum=MetricsDataFrameNameEnum.USERS,
    sorting_key="expected_value_alloc_vram_efficiency",
    ascending=True,  # we want to find users with low efficiency
    filter_criteria={
        "expected_value_alloc_vram_efficiency": {"max": 0.3, "inclusive": True},  # Efficiency of at most 0.3
        "job_count": {"min": 5, "inclusive": True},  # Minimum number of jobs to consider a user
    },
)
# The table shows the first 20 users; the plot below uses only the first 10
print("\nTop inefficient users by allocated vram efficiency:")
display(inefficient_users_alloc_vram_eff.head(20))

# Plot top inefficient users by allocated vram efficiency, passing vram_hours and
# user_job_hours as the bar label columns
# NOTE(review): unlike the jobs and PI-group plots, no output_dir_path is passed here;
# confirm whether USERS_VISUALIZATION_DATA_DIR should be supplied.
users_with_metrics_visualizer = UsersWithMetricsVisualizer(inefficient_users_alloc_vram_eff.head(10))
users_with_metrics_visualizer.visualize(
    column="expected_value_alloc_vram_efficiency",
    bar_label_columns=["vram_hours", "user_job_hours"],
    figsize=(10, 6),
)

#### <a id='toc1_2_2_2_'></a>[Distribution of `expected_value_requested_vram_efficiency`](#toc0_)

In [None]:
# Rank users by expected_value_requested_vram_efficiency, keeping only users with at least 5 jobs
inefficient_users_ev_req_vram_eff = efficiency_analysis.sort_and_filter_records_with_metrics(
    metrics_df_name_enum=MetricsDataFrameNameEnum.USERS,
    sorting_key="expected_value_requested_vram_efficiency",
    ascending=True,  # Sort by expected_value_requested_vram_efficiency in ascending order
    filter_criteria={
        "job_count": {"min": 5, "inclusive": True},  # minimum job count threshold
    },
)
# Visualize the distribution of the metric over all remaining users (no head() truncation),
# passing USERS_VISUALIZATION_DATA_DIR as the output directory
users_with_metrics_ev_visualizer = UsersWithMetricsVisualizer(inefficient_users_ev_req_vram_eff)
users_with_metrics_ev_visualizer.visualize_metric_distribution(
    output_dir_path=USERS_VISUALIZATION_DATA_DIR, column="expected_value_requested_vram_efficiency", figsize=(8, 5)
)

#### <a id='toc1_2_2_3_'></a>[Find Users with Highest `vram_hours`](#toc0_)

In [None]:
# Rank all users by vram_hours; no filter_criteria argument is passed in this call
users_with_highest_vram_hours = efficiency_analysis.sort_and_filter_records_with_metrics(
    metrics_df_name_enum=MetricsDataFrameNameEnum.USERS,
    sorting_key="vram_hours",
    ascending=False,  # Sort by vram_hours in descending order
)
# Display top users by VRAM-hours (first 20 rows)
print("\nTop users by VRAM-hours:")
display(users_with_highest_vram_hours.head(20))


# Plot the top 10 users by VRAM-hours, passing vram_hours and user_job_hours as the
# bar label columns. This rebinds users_with_metrics_visualizer.
# NOTE(review): no output_dir_path is passed here, unlike the jobs and PI-group plots;
# confirm whether USERS_VISUALIZATION_DATA_DIR should be supplied.
users_with_metrics_visualizer = UsersWithMetricsVisualizer(users_with_highest_vram_hours.head(10))
users_with_metrics_visualizer.visualize(
    column="vram_hours", bar_label_columns=["vram_hours", "user_job_hours"], figsize=(10, 6)
)

### <a id='toc1_2_3_'></a>[PI Group Efficiency Metrics](#toc0_)

In [None]:
# Bare expression: renders the per-PI-account metrics table as the cell output
pi_accounts_with_metrics

#### <a id='toc1_2_3_1_'></a>[Find PIs with Highest `vram_hours`](#toc0_)

In [None]:
top_pi_groups_by_vram_hours = efficiency_analysis.sort_and_filter_records_with_metrics(
    metrics_df_name_enum=MetricsDataFrameNameEnum.PI_GROUPS,
    sorting_key="pi_acc_vram_hours",
    ascending=False,  # Largest pi_acc_vram_hours first
    filter_criteria={
        "pi_acc_vram_hours": {"min": 200, "inclusive": True},  # VRAM-hours threshold for including a PI group
        "job_count": {"min": 5, "inclusive": True},  # Minimum number of jobs to consider a PI account
    },
)
# Display the top PI groups by VRAM-hours (first 20 rows)
print("\nTop inefficient PI Groups by VRAM-hours:")
display(top_pi_groups_by_vram_hours.head(20))

# Plot the top 10 PI groups by VRAM-hours, passing pi_acc_vram_hours and pi_acc_job_hours
# as the bar label columns
pi_group_visualizer = PIGroupsWithMetricsVisualizer(top_pi_groups_by_vram_hours.head(10))
pi_group_visualizer.visualize(
    output_dir_path=PI_GROUPS_VISUALIZATION_DATA_DIR,
    column="pi_acc_vram_hours",
    bar_label_columns=["pi_acc_vram_hours", "pi_acc_job_hours"],
    figsize=(10, 6),
)

## <a id='toc1_3_'></a>[Example: Analyze all jobs with no VRAM constraints](#toc0_)

In [None]:
# Filter jobs where no VRAM constraint was set but a GPU was allocated.
# A separate EfficiencyAnalysis instance is built over the same preprocessed jobs table.
no_vram_constraint_efficiency_analysis = ea.EfficiencyAnalysis(
    jobs_df=preprocessed_jobs_df, metrics_df_name_enum=MetricsDataFrameNameEnum
)
# NOTE(review): {"min": 0, "inclusive": False} reads as "VRAM constraint strictly greater than 0",
# which seems at odds with the "No VRAM constraints" intent; confirm against
# filter_jobs_for_analysis whether a null/NA filter value is what is actually wanted here.
all_no_vram_constraint_jobs = no_vram_constraint_efficiency_analysis.filter_jobs_for_analysis(
    vram_constraint_filter={"min": 0, "inclusive": False},  # No VRAM constraints
    gpu_count_filter={"min": 1, "inclusive": True},  # At least one GPU allocated
    gpu_mem_usage_filter={"min": 0, "inclusive": False},  # Used more than 0 GiB of VRAM
)

# Preview the first 10 matching jobs and report the (rows, columns) shape
display(all_no_vram_constraint_jobs.head(10))
print(all_no_vram_constraint_jobs.shape)

### <a id='toc1_3_1_'></a>[Job Efficiency Metrics](#toc0_)

In [None]:
# Compute job-level efficiency metrics for the jobs selected in the previous cell
no_vram_constraint_jobs_with_metrics = no_vram_constraint_efficiency_analysis.calculate_job_efficiency_metrics(
    all_no_vram_constraint_jobs
)

# Show every column for this one display call. option_context restores the previous
# "display.max_columns" value on exit (even if display raises), instead of resetting
# the option to the pandas default and discarding any value the user had set.
with pd.option_context("display.max_columns", None):
    display(no_vram_constraint_jobs_with_metrics.head(10))
print(f"Jobs found: {len(no_vram_constraint_jobs_with_metrics)}")

### <a id='toc1_3_2_'></a>[Top users with most number of jobs that have no VRAM constraints](#toc0_)

In [None]:
# Horizontal bar chart of the 20 users with the most rows in all_no_vram_constraint_jobs
if all_no_vram_constraint_jobs.empty:
    # Nothing to plot
    print("No jobs found without VRAM constraints.")
else:
    plt.figure(figsize=(10, 5))
    # value_counts() orders users by descending job count, so head(20) is the top 20
    user_counts = all_no_vram_constraint_jobs["User"].value_counts().head(20)
    ax = sns.barplot(x=user_counts.values, y=user_counts.index, orient="h")
    ax.set_xlabel("Number of Jobs")
    ax.set_ylabel("User")
    ax.set_title("Top 20 Users with the Most Jobs with no VRAM Constraints")
    plt.tight_layout()
    plt.show()

### <a id='toc1_3_3_'></a>[Find inefficient jobs with no VRAM Constraints based on `alloc_vram_efficiency_score`](#toc0_)

In [None]:
low_alloc_vram_score_jobs = no_vram_constraint_efficiency_analysis.sort_and_filter_records_with_metrics(
    metrics_df_name_enum=MetricsDataFrameNameEnum.JOBS,
    sorting_key="alloc_vram_efficiency_score",
    ascending=True,  # Sort by alloc_vram_efficiency_score in ascending order
    filter_criteria={
        "alloc_vram_efficiency_score": {"max": -10, "inclusive": True},  # score threshold
    },
)
# Display the top inefficient jobs by alloc_vram_efficiency_score
print("\nTop inefficient Jobs by allocated VRAM efficiency score:")

# The table shows the first 5 jobs; the plot below uses the first 10
display(low_alloc_vram_score_jobs.head(5))

# This rebinds jobs_with_metrics_visualizer.
# NOTE(review): no output_dir_path is passed here, unlike the earlier jobs plot;
# confirm whether JOBS_VISUALIZATION_DATA_DIR should be supplied.
jobs_with_metrics_visualizer = JobsWithMetricsVisualizer(low_alloc_vram_score_jobs.head(10))
jobs_with_metrics_visualizer.visualize(
    column="alloc_vram_efficiency_score",
    bar_label_columns=["alloc_vram_efficiency_score", "job_hours"],
    figsize=(10, 6),
)