In [None]:
import sys
from pathlib import Path

# The import below uses the ``src.`` package prefix, so the directory that
# *contains* ``src`` (the parent of the current working directory) has to be
# importable. Adding only the ``src`` directory itself is not enough on a
# fresh kernel.
project_root = str(Path.cwd().resolve().parent)
if project_root not in sys.path:
    sys.path.insert(0, project_root)

# Also expose the src directory itself, as the notebook originally did.
# Guarded so that re-running the cell does not pile up duplicate entries.
src_dir = str(Path.cwd().parent / "src")
if src_dir not in sys.path:
    sys.path.append(src_dir)

from src.database.database_connection import DatabaseConnection

# Connect to the database (path is relative to the current working directory)
db = DatabaseConnection(db_url="../slurm_data.db")

In [None]:
# Fetch every row of the Jobs table where GPUs > 0 and convert it to a DataFrame
gpu_jobs_sql = "SELECT * FROM Jobs WHERE GPUs > 0"
gpu_df = db.connection.query(gpu_jobs_sql).to_df()

# Preview the first rows, then report how many rows came back
print(gpu_df.head())
gpu_job_total = len(gpu_df)
print(f"Number of jobs with GPUs: {gpu_job_total}")

In [None]:
project_root = str(Path.cwd().resolve().parent)
print(f"Project root: {project_root}")

# Add project root to sys.path for module imports
if project_root not in sys.path:
    sys.path.insert(0, project_root)


# Automatically reload modules before executing code
# This is useful for development to see changes without restarting the kernel.
%load_ext autoreload
# Reload all modules imported with %aimport every time before executing the Python code typed.
%autoreload 1
%aimport src.analysis.vram_usage, src.preprocess.preprocess

In [None]:
# Analyzer class and preprocessing entry point from the project package
from src.analysis.vram_usage import EfficiencyAnalysis
from src.preprocess.preprocess import preprocess_data

# Options forwarded to preprocess_data for the GPU job records queried above
preprocess_options = {
    "min_elapsed_seconds": 0,
    "include_failed_cancelled_jobs": False,
    "include_cpu_only_jobs": True,
}
new_df = preprocess_data(gpu_df, **preprocess_options)

# Build the analyzer on top of the preprocessed frame
efficiency_analyzer = EfficiencyAnalysis(df=new_df, table_name="Jobs")

# Read back the jobs DataFrame held by the analyzer
df_multi_gpu = efficiency_analyzer.jobs_df

# Preview the first rows
print(df_multi_gpu.head())

In [None]:
# Filter jobs for analysis and compute per-job efficiency metrics
import numpy as np

# GPU memory usage bounds handed to the filter (min 0, no upper limit, non-inclusive)
gpu_mem_usage_bounds = {"min": 0, "max": np.inf, "inclusive": False}

new_df = efficiency_analyzer.filter_jobs_for_analysis(
    gpu_count_filter=1,
    vram_constraint_filter=None,
    gpu_mem_usage_filter=gpu_mem_usage_bounds,
)
efficiency_metrics = efficiency_analyzer.calculate_job_efficiency_metrics(new_df)

# Preview the metrics
print(efficiency_metrics.head())

In [None]:
import matplotlib.pyplot as plt

# Run the CPU-GPU usage evaluation with the chosen thresholds
analysis_results = efficiency_analyzer.evaluate_cpu_gpu_usage(
    hours_percentage_threshold=25,
    vram_efficiency_threshold=0.3,
)

# Headline statistics from the result dictionary
print("Total jobs:", analysis_results["total_jobs"])
print("Total GPU hours:", format(analysis_results["total_gpu_hours"], ".2f"))
print("Average VRAM efficiency:", format(analysis_results["avg_efficiency"], ".2%"))
print("Median VRAM efficiency:", format(analysis_results["median_efficiency"], ".2%"))

# One bullet per recommendation in the report
print("\nRecommendations:")
for recommendation in analysis_results["report"]:
    print("-", recommendation)

# Efficiency patterns table (rich display)
display(analysis_results["efficiency_patterns"])

# Optional CPU-GPU balance figure; skipped when the key is absent from the results.
cpu_gpu_balance = analysis_results.get("cpu_gpu_balance")
if cpu_gpu_balance is not None:
    plt.figure(figsize=(10, 6))
    # NOTE(review): the only plotting call is disabled, so this figure currently
    # shows labelled but empty axes.
    # cpu_gpu_balance['Job_Count'].plot(kind='bar', color='green')
    plt.title("CPU-GPU Balance Analysis")
    plt.xlabel("Workload Type")
    plt.ylabel("Job Count")
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns


# Disabled CSV export.
# NOTE(review): `zero_vram_analyzer` is not defined in the visible part of this
# notebook -- confirm the intended analyzer before re-enabling.
# output_path = 'results/main_db_analysis.csv'
# zero_vram_analyzer.export_results_to_csv(analysis_results, output_path)
# print(f"Analysis results exported to {output_path}")

# Bar chart of job hours per efficiency category (categories come from the table's index)
efficiency_patterns = analysis_results["efficiency_patterns"]
pattern_categories = efficiency_patterns.index
pattern_job_hours = efficiency_patterns["job_hours"]

plt.figure(figsize=(10, 6))
sns.barplot(x=pattern_categories, y=pattern_job_hours)
plt.title("Efficiency Patterns")
plt.xlabel("Efficiency Category")
plt.ylabel("Job Hours")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Filter jobs with additional constraints, then compute per-user metrics

# Both range filters share the same shape: a lower bound, no upper bound, non-inclusive
allocated_vram_bounds = {"min": 0, "max": np.inf, "inclusive": False}
gpu_mem_usage_bounds = {"min": 0.1, "max": np.inf, "inclusive": False}

# NOTE(review): `filtered_data` is not used again in the visible notebook.
filtered_data = efficiency_analyzer.filter_jobs_for_analysis(
    gpu_count_filter=1,
    vram_constraint_filter=None,
    allocated_vram_filter=allocated_vram_bounds,
    gpu_mem_usage_filter=gpu_mem_usage_bounds,
)

user_data = efficiency_analyzer.calculate_user_efficiency_metrics()

# Preview the per-user metrics
display(user_data.head())

# Users flagged by the analyzer as inefficient (threshold 0.3, at least 5 jobs)
inefficient_users = efficiency_analyzer.find_inefficient_users_by_alloc_vram_efficiency(
    efficiency_threshold=0.3,
    min_jobs=5,
)

# Last expression of the cell: rendered as the cell output
inefficient_users.head()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Histogram of the per-user expected allocated-VRAM efficiency.
# Missing values are dropped and anything above 1.0 is clipped to 1.0 so the
# x-axis can be cut at 1.0.
clipped_user_efficiency = user_data["expected_value_alloc_vram_efficiency"].dropna().clip(upper=1.0)

plt.figure(figsize=(8, 5))
sns.histplot(clipped_user_efficiency, bins=30, kde=True)
plt.xlabel("VRAM Efficiency (clipped at 1.0)")
# `user_data` comes from calculate_user_efficiency_metrics(), so each histogram
# count is a row of the per-user table, not a job.
plt.ylabel("Number of Users")
plt.title("Distribution of VRAM Efficiency")
plt.xlim(0, 1.0)
plt.show()

In [None]:
# Horizontal bar chart of the first ten inefficient users, annotated with their efficiency
if inefficient_users is None or inefficient_users.empty:
    print("No inefficient user data available in results.")
else:
    # Take the first ten rows as returned, then order them by efficiency for plotting
    top_problematic = inefficient_users.head(10).sort_values(
        by="expected_value_alloc_vram_efficiency", ascending=True
    )

    plt.figure(figsize=(10, 5))
    ax = sns.barplot(
        x=top_problematic["expected_value_alloc_vram_efficiency"],
        y=top_problematic["User"],
        orient="h",
    )
    plt.title("Top 10 Problematic Users (Lowest Efficiency)")
    plt.xlabel("Average Weighted VRAM Efficiency")
    plt.ylabel("User")
    plt.xlim(0, 1.0)

    # Write each bar's value just past its right end, vertically centred on the bar
    for patch in ax.patches:
        bar_length = patch.get_width()
        bar_centre_y = patch.get_y() + patch.get_height() / 2
        ax.text(bar_length + 0.01, bar_centre_y, f"{bar_length:.4f}", va="center", fontsize=9)

    plt.show()
    display(top_problematic)

In [None]:
# Compare total elapsed time of interactive vs. non-interactive jobs
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np


def _first_gpu_type(gpu_types):
    """Return the first GPU type recorded for a job, or None if there is none.

    Accepts a plain string (returned as is) or any indexable container; values
    that cannot be indexed (e.g. NaN) or are empty yield None.
    """
    if isinstance(gpu_types, str):
        return gpu_types
    try:
        return gpu_types[0]
    except (TypeError, IndexError, KeyError):
        return None


# .copy() so the column assignments below act on an independent frame and do
# not trigger pandas' chained-assignment warning.
hybrid_jobs = efficiency_analyzer.filter_jobs_for_analysis(
    gpu_count_filter=None,
    vram_constraint_filter=None,
    allocated_vram_filter={"min": 0, "max": np.inf, "inclusive": False},
    gpu_mem_usage_filter={"min": 0.1, "max": np.inf, "inclusive": False},
).copy()
hybrid_jobs["gpu_memory_used_gb"] = hybrid_jobs["GPUMemUsage"] / (2**30)

# Row-wise override: a job that used more than 16 GB on an A100/V100 is given an
# allocation of 32. The GPU-type test is evaluated per job; previously only the
# first row's GPUType was inspected and that single result applied to every row.
used_over_16gb = hybrid_jobs["gpu_memory_used_gb"] > 16
on_a100_or_v100 = hybrid_jobs["GPUType"].map(_first_gpu_type).isin(["A100", "V100"])
hybrid_jobs["allocated_vram"] = np.where(
    used_over_16gb & on_a100_or_v100,
    32,
    hybrid_jobs["allocated_vram"],
)

# Sum the elapsed time per value of the Interactive column
job_type_hours = hybrid_jobs.groupby("Interactive")["Elapsed"].sum().reset_index()

# Plot job type distribution
plt.figure(figsize=(8, 5))
ax = sns.barplot(x="Interactive", y="Elapsed", data=job_type_hours)

plt.xlabel("Job Type (Interactive vs Non-Interactive)")
plt.ylabel("Total Elapsed Seconds")
plt.title("Total Elapsed Seconds by Job Type (Interactive vs Non-Interactive)")
plt.xticks(rotation=45)
plt.tight_layout()
# The y-axis is left in its default orientation so bars grow from bottom to top;
# inverting it would make them hang from the top of the plot.
plt.show()

In [None]:
# Compute PI-account efficiency metrics, then ask the analyzer for inefficient PIs
pi_metrics = efficiency_analyzer.calculate_pi_account_efficiency_metrics()
inefficient_pis = efficiency_analyzer.find_inefficient_pis_weighted_by_hours(
    efficiency_threshold=0.3,
    min_jobs=5,
)

# Preview the first rows
display(inefficient_pis.head())

In [None]:
# Summarise the GPUs column of the hybrid jobs
gpus_per_job = hybrid_jobs["GPUs"]
print("Unique GPU counts in hybrid jobs:", gpus_per_job.unique())
print("GPU count distribution in hybrid jobs:")
print(gpus_per_job.value_counts())
gpu_count_summary = (
    f"Min GPUs: {gpus_per_job.min()}, "
    f"Max GPUs: {gpus_per_job.max()}, "
    f"Mean GPUs: {gpus_per_job.mean():.2f}"
)
print(gpu_count_summary)

# First ten inefficient users whose weighted efficiency contribution is positive
has_positive_contribution = inefficient_users["Weighted_Efficiency_Contribution"] > 0
top_inefficient_users = inefficient_users[has_positive_contribution].head(10)

plt.figure(figsize=(10, 6))
ax = sns.barplot(
    y=top_inefficient_users["User"],
    x=top_inefficient_users["Weighted_Efficiency_Contribution"],
)
plt.title("Top 10 Inefficient Users")
plt.xlabel("Weighted Efficiency Contribution")
plt.ylabel("User")


plt.tight_layout()
plt.show()

In [None]:
# Store the first entry of each job's GPUType value in its own column
hybrid_jobs["GPUType_First"] = hybrid_jobs["GPUType"].map(lambda gpu_types: gpu_types[0])

# Compute per-job efficiency metrics, then aggregate them per first GPU type
hybrid_jobs = efficiency_analyzer.calculate_job_efficiency_metrics(hybrid_jobs)
print(hybrid_jobs.columns)

# Mean and standard deviation for each of these columns
summary_columns = ("alloc_vram_efficiency", "gpu_memory_used_gb", "allocated_vram", "job_hours")
gpu_stats_first = hybrid_jobs.groupby("GPUType_First").agg(
    {column: ["mean", "std"] for column in summary_columns}
)
print("Statistics grouped by GPUType_First:")
print(gpu_stats_first)

# Bar chart of how often each first GPU type occurs
requested_gpu_counts_first = hybrid_jobs["GPUType_First"].value_counts()
plt.figure(figsize=(12, 6))
sns.barplot(x=requested_gpu_counts_first.index, y=requested_gpu_counts_first.values)
plt.xlabel("GPUType_First")
plt.ylabel("Request Count")
plt.title("Most Assigned GPUs (First Element)")
plt.xticks(rotation=45)
plt.show()

In [None]:
# Jobs whose allocated-VRAM efficiency exceeds 1, restricted to the columns
# useful for inspecting them. The last expression is rendered as the cell output.
inspection_columns = [
    "JobID",
    "Interactive",
    "Status",
    "ExitCode",
    "Constraints",
    "NodeList",
    "gpu_count",
    "GPUMemUsage",
    "gpu_memory_used_gb",
    "allocated_vram",
    "GPUType",
    "alloc_vram_efficiency",
]
efficiency_above_one = hybrid_jobs["alloc_vram_efficiency"] > 1
hybrid_jobs.loc[efficiency_above_one, inspection_columns]

In [None]:
# Keep jobs with at least one GPU, excluding those whose Status is TIMEOUT or
# OUT_OF_MEMORY. The filter is `gpu_count > 0`, so single-GPU jobs are included
# as well -- this is the full set of GPU jobs, not only multi-GPU ones.
has_gpu = hybrid_jobs["gpu_count"] > 0
excluded_status = hybrid_jobs["Status"].isin(["TIMEOUT", "OUT_OF_MEMORY"])
gpu_jobs = hybrid_jobs[has_gpu & ~excluded_status]
print(f"Number of GPU jobs: {len(gpu_jobs)}")

# Mean and standard deviation of efficiency, job hours and GPU memory usage per GPU count
gpu_summary = gpu_jobs.groupby("gpu_count").agg(
    {"alloc_vram_efficiency": ["mean", "std"], "job_hours": ["mean", "std"], "GPUMemUsage": ["mean", "std"]}
)
print("GPU job summary:")
print(gpu_summary)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Box plot of allocated-VRAM efficiency for each GPU count in gpu_jobs
plt.figure(figsize=(10, 6))
sns.boxplot(data=gpu_jobs, x="gpu_count", y="alloc_vram_efficiency")
plt.xlabel("GPU Count")
plt.ylabel("Efficiency")
plt.title("Efficiency Distribution by GPU Count")
plt.show()

# Jobs with efficiency above 1, highest first; the last expression is the cell output
inspection_columns = [
    "JobID",
    "Status",
    "ExitCode",
    "Constraints",
    "gpu_count",
    "gpu_memory_used_gb",
    "allocated_vram",
    "GPUType",
    "alloc_vram_efficiency",
]
efficiency_above_one = gpu_jobs["alloc_vram_efficiency"] > 1
gpu_jobs.loc[efficiency_above_one, inspection_columns].sort_values(
    by="alloc_vram_efficiency", ascending=False
)

In [None]:
# Scatter plot of GPU memory usage against GPU count for gpu_jobs
plt.figure(figsize=(10, 6))
sns.scatterplot(data=gpu_jobs, x="gpu_count", y="GPUMemUsage")
plt.xlabel("GPU Count")
plt.ylabel("GPU Memory Usage")
plt.title("GPU Utilization vs. GPU Count")
plt.show()

In [None]:
# Box plot of job hours for each GPU count in gpu_jobs
plt.figure(figsize=(10, 6))
sns.boxplot(data=gpu_jobs, x="gpu_count", y="job_hours")
plt.xlabel("GPU Count")
plt.ylabel("Job Duration (hours)")
plt.title("Job Duration Distribution by GPU Count")
plt.show()

In [None]:
# Line plot of allocated-VRAM efficiency against the Elapsed column.
# NOTE(review): rows are sorted by Elapsed and only the first 1000 are plotted,
# so the x-axis is each job's Elapsed value rather than a calendar timeline --
# confirm that this matches the "Over Time" title.
vals = gpu_jobs.sort_values("Elapsed")
first_thousand_by_elapsed = vals.head(1000)

plt.figure(figsize=(10, 6))
sns.lineplot(data=first_thousand_by_elapsed, x="Elapsed", y="alloc_vram_efficiency")
plt.xlabel("Elapsed Time")
plt.ylabel("VRAM Efficiency")
plt.title("VRAM Efficiency Over Time")
plt.xticks(rotation=45)
plt.show()

In [None]:
# Analyze GPU jobs whose CPU memory usage exceeds their GPU memory usage
import matplotlib.pyplot as plt
import seaborn as sns

# A job qualifies when CPUMemUsage > cpu_gpu_threshold * GPUMemUsage
cpu_gpu_threshold = 1
high_cpu_jobs = gpu_jobs[gpu_jobs["CPUMemUsage"] > cpu_gpu_threshold * gpu_jobs["GPUMemUsage"]]

# Per-user totals for the qualifying jobs, ranked by the ratio of the two totals
top_users_high_cpu = high_cpu_jobs.groupby("User").agg({"CPUMemUsage": "sum", "GPUMemUsage": "sum"}).reset_index()
top_users_high_cpu["CPU_to_GPU_Ratio"] = top_users_high_cpu["CPUMemUsage"] / top_users_high_cpu["GPUMemUsage"]
top_users_high_cpu = top_users_high_cpu.sort_values(by="CPU_to_GPU_Ratio", ascending=False).head(10)

# Display the top users
print("Top users where CPU usage exceeds GPU usage:")
print(top_users_high_cpu)

# Visualize the top users
plt.figure(figsize=(10, 6))
sns.barplot(x="CPU_to_GPU_Ratio", y="User", data=top_users_high_cpu, palette="viridis")
plt.xlabel("CPU to GPU Usage Ratio")
plt.ylabel("User")
plt.title("Top Users with High CPU Usage Relative to GPU Usage")
plt.tight_layout()
plt.show()

# Ten qualifying jobs with the largest CPUMemUsage, plus their per-job ratio.
# .assign() builds a new frame, so no column is written into the slice returned
# by .head(), which avoids pandas' SettingWithCopyWarning.
top_jobs_high_cpu = (
    high_cpu_jobs.sort_values(by="CPUMemUsage", ascending=False)
    .head(10)
    .assign(cpu_gpu_ratio=lambda jobs: jobs["CPUMemUsage"] / jobs["GPUMemUsage"])
)
print("Top jobs where CPU usage exceeds GPU usage:")
print(top_jobs_high_cpu[["JobID", "User", "CPUMemUsage", "GPUMemUsage", "cpu_gpu_ratio"]])

# Display the filtered jobs
print("Jobs with high CPU usage relative to GPU usage:")
print(high_cpu_jobs.head())

# Scatter of CPU vs GPU memory usage for the qualifying jobs
plt.figure(figsize=(10, 6))
sns.scatterplot(
    x=high_cpu_jobs["GPUMemUsage"],
    y=high_cpu_jobs["CPUMemUsage"],
    hue=high_cpu_jobs["Interactive"],  # colour points by the Interactive column
    palette="viridis",
)
plt.xlabel("GPU Usage")
plt.ylabel("CPU Usage")
plt.title("CPU vs GPU Usage for High CPU Jobs")
plt.tight_layout()
plt.show()