# Inefficient Users Analysis - Requested VRAM Efficiency

This notebook analyzes inefficient users based on requested VRAM efficiency. 

The analysis focuses on identifying users who consistently request more VRAM than they actually use, which can indicate inefficient resource allocation and impact overall cluster utilization.

In [None]:
# Import required modules

import sys
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Put the directory above the notebook's working directory at the front of
# sys.path (once only) so that imports of the project's `src` package resolve.
project_root = str(Path.cwd().resolve().parent)
if sys.path.count(project_root) == 0:
    sys.path.insert(0, project_root)

In [None]:
# Load IPython's autoreload extension so edits to project modules are picked up without restarting the kernel.
%load_ext autoreload
# Mode 1: reload all modules imported with %aimport every time before executing the Python code typed.
%autoreload 1
# Register the project modules to auto-reload (the trailing backslash continues the magic onto the next line).
%aimport src.visualization.columns, src.database.database_connection, src.visualization.models,\
src.preprocess.preprocess, src.analysis.efficiency_analysis

In [None]:
from src.analysis import efficiency_analysis as ea
from src.utilities.load_and_preprocess_jobs import load_and_preprocess_jobs

# Echo the path that was added to sys.path so a wrong working directory is easy to spot
print(f"Project root: {project_root}")

# Pandas display: set both options to None so wide frames are not truncated column-wise
for option_name in ('display.max_columns', 'display.width'):
    pd.set_option(option_name, None)

# Plot styling: matplotlib defaults plus seaborn's "husl" colour palette
plt.style.use('default')
sns.set_palette("husl")

## Load Dataset

Load the complete jobs dataset from the database without any filtering to analyze all users and their requested VRAM efficiency.

In [None]:
# Read the whole "Jobs" table from the database file and preprocess it (no filtering arguments are passed)
preprocessed_jobs_df = load_and_preprocess_jobs(table_name="Jobs", db_path="../20250814_slurm_data.db")

print(f"Loaded {len(preprocessed_jobs_df)} jobs from the database")

# Preview the first rows to sanity-check the loaded columns
display(preprocessed_jobs_df.head())

## Initialize Efficiency Analysis

Create an EfficiencyAnalysis instance and calculate all efficiency metrics for the complete dataset.

In [None]:
# Build the analysis object on top of the full, unfiltered jobs frame
efficiency_analysis = ea.EfficiencyAnalysis(jobs_df=preprocessed_jobs_df)

# Compute job-, user- and PI-account-level metrics in one call (no filtering)
print("Calculating efficiency metrics for all jobs...")
metrics_dict = efficiency_analysis.calculate_all_efficiency_metrics(preprocessed_jobs_df)

# Unpack the three result frames from the returned dictionary
jobs_with_metrics, users_with_metrics, pi_accounts_with_metrics = (
    metrics_dict[metrics_key]
    for metrics_key in (
        "jobs_with_efficiency_metrics",
        "users_with_efficiency_metrics",
        "pi_accounts_with_efficiency_metrics",
    )
)

# Report how many rows each aggregation level produced
print("Calculated metrics for:")
print(f"  - {len(jobs_with_metrics)} jobs")
print(f"  - {len(users_with_metrics)} users")
print(f"  - {len(pi_accounts_with_metrics)} PI accounts")

## Inefficient Users by Requested VRAM Efficiency

Find users who consistently request more VRAM than they actually use. This analysis helps identify users who may need guidance on optimal resource allocation.

In [None]:
# Select users whose requested VRAM efficiency is below 0.3
# (i.e. they use less than 30% of the VRAM they request).
# NOTE(review): with "inclusive": False the bounds are presumably exclusive, which
# would also drop users at exactly 0 efficiency — confirm against the filter implementation.
requested_vram_filter = {"min": 0, "max": 0.3, "inclusive": False}
inefficient_users_requested_vram = efficiency_analysis.find_inefficient_users_by_requested_vram_efficiency(
    requested_vram_efficiency_filter=requested_vram_filter,
    min_jobs=10,  # ignore users with fewer than 10 jobs
)

print(f"Found {len(inefficient_users_requested_vram)} inefficient users by requested VRAM efficiency")
print("\nTop 10 most inefficient users by requested VRAM efficiency:")
display(inefficient_users_requested_vram.head(10))

## Analysis at Different Efficiency Thresholds

Let's analyze users at different efficiency thresholds to understand the distribution of inefficiency.

In [None]:
# Sweep a range of efficiency cut-offs and record, for each one, how many users
# fall at or below it and how many VRAM-hours / job-hours those users account for.
thresholds = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]
threshold_analysis = []

for threshold in thresholds:
    inefficient_users = efficiency_analysis.find_inefficient_users_by_requested_vram_efficiency(
        requested_vram_efficiency_filter={"max": threshold, "inclusive": True},
        min_jobs=5,
    )

    if inefficient_users.empty:
        # Nothing matched this cut-off: report zeros instead of touching the columns
        total_vram_hours = total_job_hours = avg_efficiency = 0
    else:
        total_vram_hours = inefficient_users["vram_hours"].sum()
        total_job_hours = inefficient_users["user_job_hours"].sum()
        avg_efficiency = inefficient_users["expected_value_requested_vram_efficiency"].mean()

    threshold_analysis.append(
        dict(
            threshold=threshold,
            user_count=len(inefficient_users),
            total_vram_hours=total_vram_hours,
            total_job_hours=total_job_hours,
            avg_efficiency=avg_efficiency,
        )
    )

# One row per threshold, built in a single DataFrame construction
threshold_df = pd.DataFrame(threshold_analysis)
print("Users below different requested VRAM efficiency thresholds:")
display(threshold_df)

## Summary

Based on the analysis of requested VRAM efficiency across all users, we can identify the worst offenders.

In [None]:
# Summarize how many users fall into the inefficient group and what share of all
# VRAM-hours they account for, then print the figures (previously they were
# computed but never shown).
efficiency_column = "expected_value_requested_vram_efficiency"
total_users = len(users_with_metrics)
users_with_data = int(users_with_metrics[efficiency_column].notna().sum())

# Users returned by the main inefficiency query (below 30% efficiency, at least 10 jobs).
# This is a count, so it is named as one.
inefficient_user_count = len(inefficient_users_requested_vram)
pct_inefficient = (inefficient_user_count / users_with_data) * 100 if users_with_data > 0 else 0

# Total resource impact. The inefficient-user sum is guarded the same way as in the
# threshold analysis, so an empty result does not require the column to exist.
has_inefficient_users = not inefficient_users_requested_vram.empty
total_vram_hours_inefficient = (
    inefficient_users_requested_vram["vram_hours"].sum() if has_inefficient_users else 0
)
total_vram_hours_all = users_with_metrics["vram_hours"].sum()
pct_vram_hours_wasted = (total_vram_hours_inefficient / total_vram_hours_all) * 100 if total_vram_hours_all > 0 else 0

print("=== SUMMARY ===")
print(f"Total users: {total_users}")
print(f"Users with requested VRAM efficiency data: {users_with_data}")
print(
    f"Inefficient users (< 30% efficiency, >= 10 jobs): {inefficient_user_count} "
    f"({pct_inefficient:.1f}% of users with data)"
)
print(
    f"VRAM-hours of inefficient users: {total_vram_hours_inefficient:.1f} of "
    f"{total_vram_hours_all:.1f} ({pct_vram_hours_wasted:.1f}%)"
)

# Top worst performer
# NOTE(review): assumes the query result is ordered worst-first — confirm against the method.
if has_inefficient_users:
    worst_user = inefficient_users_requested_vram.iloc[0]
    print("\n=== WORST PERFORMER ===")
    print(f"User: {worst_user['User']}")
    print(f"Requested VRAM efficiency: {worst_user[efficiency_column]:.3f}")
    print(f"Job count: {worst_user['job_count']}")
    print(f"VRAM-hours: {worst_user['vram_hours']:.1f}")
    print(f"PI Account: {worst_user['pi_account']}")

### Generate Report for Worst Performing User

Let's generate a report for the worst performing user we identified earlier. This can also be done through a separate script:
```sh
python scripts/generate_user_reports.py --db-path ./20250814_slurm_data.db users --users user1,user2
```

In [None]:
# Generate a report for one of the most inefficient users.
if not inefficient_users_requested_vram.empty:
    # Prefer the 2nd listed user as they have more jobs; fall back to the first
    # row when only one inefficient user exists (iloc[1] would raise IndexError).
    report_user_position = 1 if len(inefficient_users_requested_vram) > 1 else 0
    worst_user_id = inefficient_users_requested_vram.iloc[report_user_position]["User"]

    print(f"Generating report for inefficient user: {worst_user_id}")

    report_path = efficiency_analysis.generate_user_report(
        user_id=worst_user_id,
        output_dir="../reports/user_reports",
        template_path="../reports/ppt_user_report_template.qmd"
    )

    # A falsy return value is treated as failure, as in the original cell
    if report_path:
        print(f"\nReport generated: {report_path}")
    else:
        print("\nFailed to generate report.")
else:
    print("No inefficient users found to generate report for.")

### Generate Reports for Top 5 Inefficient Users

In [None]:
# Generate one report per user for the first five rows of the inefficient-user table,
# collecting successes and failures so a single bad user does not stop the batch.
if inefficient_users_requested_vram.empty:
    print("No inefficient users found to generate reports for.")
else:
    top_5_users = inefficient_users_requested_vram["User"].head(5).tolist()

    print(f"Generating reports for top 5 inefficient users: {top_5_users}")

    # Keyword arguments shared by every report in this batch
    report_kwargs = {
        "output_dir": "../reports/user_reports",
        "template_path": "../reports/ppt_user_report_template.qmd",
    }
    successful_reports = []
    failed_reports = []

    for user_id in top_5_users:
        print(f"\n--- Generating report for {user_id} ---")

        try:
            report_path = efficiency_analysis.generate_user_report(user_id=user_id, **report_kwargs)

            # A falsy return value counts as a failure
            if not report_path:
                failed_reports.append(user_id)
                print(f"❌ Failed: {user_id}")
            else:
                successful_reports.append((user_id, report_path))
                print(f"✅ Success: {user_id}")

        except Exception as e:
            # Deliberate best-effort: record the failure and continue with the next user
            failed_reports.append(user_id)
            print(f"❌ Exception for {user_id}: {e}")

    print(f"Successfully generated: {len(successful_reports)} reports")
    print(f"Failed: {len(failed_reports)} reports")