In [None]:
import sys
from pathlib import Path

# Make the src/ directory that sits next to the current working directory importable
src_dir = Path.cwd().parent / "src"
sys.path.append(str(src_dir))

from database.DatabaseConnection import DatabaseConnection

# Create the DatabaseConnection for ../slurm_data.db (path is relative to the working directory)
db = DatabaseConnection(db_url="../slurm_data.db")

In [None]:
# Fetch every row of Jobs whose GPUs value is above zero
gpu_jobs_query = "SELECT * FROM Jobs WHERE GPUs > 0"
gpu_df = db.connection.query(gpu_jobs_query).to_df()

# Preview the first rows and report how many jobs came back
print(gpu_df.head())
print(f"Number of jobs with GPUs: {len(gpu_df)}")

In [None]:
# Import analysis and preprocess functions
from analysis.vram_usage import EfficiencyAnalysis
from preprocess.preprocess import preprocess_data

# Preprocess the GPU jobs with the options below before handing them to the analyzer
preprocess_options = {
    "min_elapsed_second": 0,
    "include_failed_cancelled_jobs": False,
    "include_CPU_only_job": True,
}
new_df = preprocess_data(gpu_df, **preprocess_options)

# Build the analyzer on the preprocessed frame
efficiency_analyzer = EfficiencyAnalysis(df=new_df, table_name="Jobs")

# Preview the frame the analyzer exposes as jobs_df
df_multi_gpu = efficiency_analyzer.jobs_df
print(df_multi_gpu.head())

# Compute the efficiency metrics with the filter values below
metrics_filters = {"gpus_min": 1, "vram_constraint_filter": 0, "gpu_mem_usage_min": 0}
efficiency_metrics = efficiency_analyzer.calculate_efficiency_metrics(**metrics_filters)

# Preview the metrics and list the columns of the preprocessed frame
print(efficiency_metrics.head())
print(new_df.columns)

In [None]:
# Recompute the metrics, this time also passing elapsed_seconds_min=600
filtered_jobs = efficiency_analyzer.calculate_efficiency_metrics(
    vram_constraint_filter=0,
    allocated_vram_greater_than=0,
    gpu_mem_usage_min=0,
    gpus_min=1,
    elapsed_seconds_min=600,
)

# Kept as the cell's last expression so the preview is rendered as the cell output
filtered_jobs.head()

In [None]:
import matplotlib.pyplot as plt

# results_df = db.connection.query("SELECT * FROM Jobs WHERE array_length(constraints, 1) > 1 and GPUs > 1").to_df()
# # results_df.to_csv("results.csv", index=False)
# results_df2 = db.connection.query("SELECT * FROM Jobs WHERE array_length(GPUType, 1) > 1").to_df()
# # results_df2.to_csv("results2.csv", index=False)

# Run the analyzer's CPU-GPU usage evaluation; the result is read below like a dict
# with summary numbers, a "report" iterable and an "efficiency_patterns" table.
analysis_results = efficiency_analyzer.evaluate_cpu_gpu_usage(
    hours_percentage_threshold=25, vram_efficiency_threshold=0.3
)

# Display key summary statistics (efficiencies are formatted as percentages)
print("Total jobs:", analysis_results["total_jobs"])
print("Total GPU hours:", f"{analysis_results['total_gpu_hours']:.2f}")
print("Average VRAM efficiency:", f"{analysis_results['avg_efficiency']:.2%}")
print("Median VRAM efficiency:", f"{analysis_results['median_efficiency']:.2%}")

# Show recommendations, one bullet per entry of the report
print("\nRecommendations:")
for rec in analysis_results["report"]:
    print("-", rec)

# Display efficiency patterns table
display(analysis_results["efficiency_patterns"])

# Visualize the analysis results; "cpu_gpu_balance" is optional, so the plot is skipped when absent.
# NOTE(review): the only plotting call below is commented out, so when the key is present this
# draws an empty figure that carries just the labels and title — confirm whether the bar plot
# should be restored or the figure removed.
cpu_gpu_balance = analysis_results.get("cpu_gpu_balance", None)
if cpu_gpu_balance is not None:
    plt.figure(figsize=(10, 6))
    # cpu_gpu_balance['Job_Count'].plot(kind='bar', color='green')
    plt.xlabel("Workload Type")
    plt.ylabel("Job Count")
    plt.title("CPU-GPU Balance Analysis")
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns


# Export the analysis results to a CSV file
# output_path = 'results/main_db_analysis.csv'
# zero_vram_analyzer.export_results_to_csv(analysis_results, output_path)
# print(f"Analysis results exported to {output_path}")

# Bar chart of the GPU_Hours column for each row of the efficiency-patterns table
efficiency_patterns = analysis_results["efficiency_patterns"]

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=efficiency_patterns.index, y=efficiency_patterns["GPU_Hours"], ax=ax)
ax.set_xlabel("Efficiency Category")
ax.set_ylabel("GPU Hours")
ax.set_title("Efficiency Patterns")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Metrics for the rest of the notebook, computed with gpu_mem_usage_min=0.1
filtered_data = efficiency_analyzer.calculate_efficiency_metrics(
    allocated_vram_greater_than=0,
    gpus_min=1,
    gpu_mem_usage_min=0.1,
)
display(filtered_data.head())

# Users reported by the analyzer for efficiency_threshold=0.3 and min_jobs=5
inefficient_users = efficiency_analyzer.find_inefficient_users_weighted_by_hours(
    efficiency_threshold=0.3,
    min_jobs=5,
)

# Kept as the cell's last expression so the preview is rendered as the cell output
inefficient_users.head()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Histogram of VRAM efficiency; missing values are dropped and values above 1.0 are clipped to 1.0
clipped_efficiency = filtered_data["vram_efficiency"].dropna().clip(upper=1.0)

fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(clipped_efficiency, bins=30, kde=True, ax=ax)
ax.set_xlabel("VRAM Efficiency (clipped at 1.0)")
ax.set_ylabel("Number of Jobs")
ax.set_title("Distribution of VRAM Efficiency")
ax.set_xlim(0, 1.0)
plt.show()

In [None]:
# Horizontal bar chart of the first ten inefficient users, each bar labelled with its value
if inefficient_users is None or inefficient_users.empty:
    print("No inefficient user data available in results.")
else:
    # Take the first ten rows, then order them from lowest to highest efficiency
    top_problematic = inefficient_users.head(10).sort_values(by="Avg_Weighted_VRAM_Efficiency", ascending=True)

    plt.figure(figsize=(10, 5))
    ax = sns.barplot(y=top_problematic["User"], x=top_problematic["Avg_Weighted_VRAM_Efficiency"], orient="h")
    plt.xlabel("Average Weighted VRAM Efficiency")
    plt.ylabel("User")
    plt.title("Top 10 Problematic Users (Lowest Efficiency)")
    plt.xlim(0, 1.0)

    # Write each bar's value just past its right end, vertically centred on the bar
    for patch in ax.patches:
        bar_length = patch.get_width()
        bar_middle = patch.get_y() + patch.get_height() / 2
        ax.text(bar_length + 0.01, bar_middle, f"{bar_length:.4f}", va="center", fontsize=9)

    plt.show()
    display(top_problematic)

In [None]:
# group by "interactive" using number of hours
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Work on a copy so the VRAM correction below cannot silently rewrite filtered_data
hybrid_jobs = filtered_data.copy()


def _primary_gpu_type(gpu_type):
    """Return the lower-cased GPU type of one job, or None when it is missing.

    Accepts either a plain string or a sequence of type names (the first entry is used).
    """
    if isinstance(gpu_type, str):
        return gpu_type.lower()
    try:
        return str(gpu_type[0]).lower()
    except (TypeError, IndexError, KeyError):
        # None/NaN, an empty sequence, or a mapping without key 0
        return None


# Jobs on A100/V100 hardware that used more than 16 GB are treated as 32 GB allocations.
# The GPU type is checked row by row, so one job's hardware never decides the outcome for
# every other job, and the comparison is case-insensitive.
is_a100_or_v100 = hybrid_jobs["GPUType"].apply(_primary_gpu_type).isin(["a100", "v100"])
hybrid_jobs["allocated_vram"] = np.where(
    (hybrid_jobs["gpu_memory_used_gb"] > 16) & is_a100_or_v100,
    32,
    hybrid_jobs["allocated_vram"],
)

# Sum the Elapsed column for each value of Interactive
job_type_hours = hybrid_jobs.groupby("Interactive")["Elapsed"].sum().reset_index()

# Plot job type distribution
plt.figure(figsize=(8, 5))
ax = sns.barplot(x="Interactive", y="Elapsed", data=job_type_hours)

plt.xlabel("Job Type (Interactive vs Non-Interactive)")
plt.ylabel("Total Elapsed Seconds")
plt.title("Total Elapsed Seconds by Job Type (Interactive vs Non-Interactive)")
plt.xticks(rotation=45)
plt.tight_layout()
# The y-axis keeps its default orientation so the bars grow upward from zero
plt.show()

In [None]:
# test = hybrid_jobs[hybrid_jobs["allocated_vram"] < hybrid_jobs["gpu_memory_used_gb"]]
# print(test[["allocated_vram", "gpu_memory_used_gb", "GPUs", "vram_efficiency", "GPUType", "Status", "ExitCode"]])
# display(test)

In [None]:
# results_df = db.connection.query("SELECT * FROM Jobs WHERE array_length(constraints, 1) > 1 and GPUs > 1").to_df()
# # results_df.to_csv("results.csv", index=False)
# results_df2 = db.connection.query("SELECT * FROM Jobs WHERE array_length(GPUType, 1) > 1").to_df()
# # results_df2.to_csv("results2.csv", index=False)
# results_df2

In [None]:
# Jobs whose GPUType array has more than one entry and whose GPUMemUsage is above zero
multi_type_query = "SELECT * FROM Jobs WHERE array_length(GPUType, 1) > 1 AND GPUMemUsage > 0"
db.connection.query(multi_type_query).to_df()

In [None]:
# PIs reported by the analyzer for efficiency_threshold=0.3 and min_jobs=5
inefficient_pis = efficiency_analyzer.find_inefficient_pis_weighted_by_hours(
    efficiency_threshold=0.3,
    min_jobs=5,
)

# Preview the first rows
display(inefficient_pis.head())

In [None]:
# Summarise the GPUs column of hybrid_jobs: distinct values, frequencies, and min/max/mean
gpu_counts = hybrid_jobs["GPUs"]
print("Unique GPU counts in hybrid jobs:", gpu_counts.unique())
print("GPU count distribution in hybrid jobs:")
print(gpu_counts.value_counts())
print(
    f"Min GPUs: {gpu_counts.min()}, "
    f"Max GPUs: {gpu_counts.max()}, "
    f"Mean GPUs: {gpu_counts.mean():.2f}"
)

# Bar chart of the first ten users whose Weighted_Efficiency_Contribution is above zero
has_contribution = inefficient_users["Weighted_Efficiency_Contribution"] > 0
top_inefficient_users = inefficient_users[has_contribution].head(10)

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(
    x=top_inefficient_users["Weighted_Efficiency_Contribution"],
    y=top_inefficient_users["User"],
    ax=ax,
)
ax.set_xlabel("Weighted Efficiency Contribution")
ax.set_ylabel("User")
ax.set_title("Top 10 Inefficient Users")

plt.tight_layout()
plt.show()

In [None]:
# Derive one GPU type per job from the first entry of its GPUType value
hybrid_jobs["GPUType_First"] = hybrid_jobs["GPUType"].map(lambda gpu_types: gpu_types[0])

# Mean and standard deviation of each metric, per first-listed GPU type
summary_columns = ["vram_efficiency", "gpu_memory_used_gb", "allocated_vram", "gpu_hours"]
gpu_stats_first = hybrid_jobs.groupby("GPUType_First").agg({column: ["mean", "std"] for column in summary_columns})
print("Statistics grouped by GPUType_First:")
print(gpu_stats_first)

# Bar chart of how many jobs fall under each first-listed GPU type
requested_gpu_counts_first = hybrid_jobs["GPUType_First"].value_counts()

fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x=requested_gpu_counts_first.index, y=requested_gpu_counts_first.values, ax=ax)
ax.set_title("Most Assigned GPUs (First Element)")
ax.set_xlabel("GPUType_First")
ax.set_ylabel("Request Count")
plt.xticks(rotation=45)
plt.show()

In [None]:
# Jobs whose vram_efficiency is above 1, restricted to the columns useful for inspecting them
inspection_columns = [
    "JobID",
    "Interactive",
    "Status",
    "ExitCode",
    "Constraints",
    "NodeList",
    "GPUs",
    "GPUMemUsage",
    "gpu_memory_used_gb",
    "allocated_vram",
    "GPUType",
    "vram_efficiency",
]
hybrid_jobs.loc[hybrid_jobs["vram_efficiency"] > 1, inspection_columns]

In [None]:
# Correct the VRAM allocation, then keep the GPU jobs that did not end in TIMEOUT / OUT_OF_MEMORY
import numpy as np

# Jobs on A100/V100 hardware that used more than 16 GB are treated as 32 GB allocations.
# The first GPUType entry of every job is tested, so the correction is decided per row
# and not by whichever job happens to be first in the frame.
first_gpu_type = hybrid_jobs["GPUType"].map(lambda gpu_types: str(gpu_types[0]).lower())
is_a100_or_v100 = first_gpu_type.isin(["a100", "v100"])
hybrid_jobs["allocated_vram"] = np.where(
    (hybrid_jobs["gpu_memory_used_gb"] > 16) & is_a100_or_v100,
    32,
    hybrid_jobs["allocated_vram"],
)

# Every job with at least one GPU is kept (single-GPU jobs included).
# .copy() gives gpu_jobs its own data so later cells can add columns to it without
# pandas' SettingWithCopyWarning.
gpu_jobs = hybrid_jobs[
    (hybrid_jobs["GPUs"] > 0) & (~hybrid_jobs["Status"].isin(["TIMEOUT", "OUT_OF_MEMORY"]))
].copy()
print(f"Number of GPU jobs: {len(gpu_jobs)}")

# Mean and standard deviation of each metric, per GPU count
gpu_summary = gpu_jobs.groupby("GPUs").agg(
    {"vram_efficiency": ["mean", "std"], "gpu_hours": ["mean", "std"], "GPUMemUsage": ["mean", "std"]}
)
print("GPU job summary:")
print(gpu_summary)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Box plot of vram_efficiency for each GPU count in gpu_jobs
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x="GPUs", y="vram_efficiency", data=gpu_jobs, ax=ax)
ax.set_title("Efficiency Distribution by GPU Count")
ax.set_xlabel("GPU Count")
ax.set_ylabel("Efficiency")
plt.show()

# Jobs whose vram_efficiency is above 1, highest first; last expression so it renders as the cell output
over_one_columns = [
    "JobID",
    "Status",
    "ExitCode",
    "Constraints",
    "GPUs",
    "gpu_memory_used_gb",
    "allocated_vram",
    "GPUType",
    "vram_efficiency",
]
gpu_jobs.loc[gpu_jobs["vram_efficiency"] > 1, over_one_columns].sort_values(by="vram_efficiency", ascending=False)

In [None]:
# Scatter of the GPUMemUsage column against the GPU count of each job
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(x="GPUs", y="GPUMemUsage", data=gpu_jobs, ax=ax)
ax.set_title("GPU Utilization vs. GPU Count")
ax.set_xlabel("GPU Count")
ax.set_ylabel("GPU Memory Usage")
plt.show()

In [None]:
# Box plot of the gpu_hours column for each GPU count in gpu_jobs
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x="GPUs", y="gpu_hours", data=gpu_jobs, ax=ax)
ax.set_title("Job Duration Distribution by GPU Count")
ax.set_xlabel("GPU Count")
ax.set_ylabel("Job Duration (hours)")
plt.show()

In [None]:
# Line plot of vram_efficiency against the Elapsed column.
# Jobs are sorted by Elapsed in ascending order and only the first 1000 rows are drawn,
# i.e. the 1000 jobs with the smallest Elapsed values.
# NOTE(review): the title says "Over Time" but the x-axis is Elapsed, not a timestamp;
# if calendar time is intended, StartTime is presumably the column to sort and plot by — confirm.
vals = gpu_jobs.sort_values("Elapsed")
plt.figure(figsize=(10, 6))
sns.lineplot(x="Elapsed", y="vram_efficiency", data=vals.head(1000))
plt.title("VRAM Efficiency Over Time")
plt.xlabel("Elapsed Time")
plt.ylabel("VRAM Efficiency")
plt.xticks(rotation=45)
plt.show()

In [None]:
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns

# Cluster jobs on a single feature: vram_efficiency (rows with a missing value are dropped)
vram_efficiency_data = gpu_jobs[["vram_efficiency"]].dropna()

# Elbow method: fit KMeans for 1..10 clusters and record the inertia of each fit
cluster_counts = range(1, 11)
inertia = [
    KMeans(n_clusters=cluster_count, random_state=42).fit(vram_efficiency_data).inertia_
    for cluster_count in cluster_counts
]

fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(cluster_counts, inertia, marker="o")
ax.set_xlabel("Number of Clusters")
ax.set_ylabel("Inertia")
ax.set_title("Elbow Method for Optimal Clusters")
plt.show()

# Fit the final model with 3 clusters; the labels are computed before the Cluster column
# exists, so the model only ever sees vram_efficiency
kmeans = KMeans(n_clusters=3, random_state=42)
vram_efficiency_data["Cluster"] = kmeans.fit_predict(vram_efficiency_data)

# Scatter of efficiency against the row index, coloured by cluster label
fig, ax = plt.subplots(figsize=(8, 5))
sns.scatterplot(
    x=vram_efficiency_data.index,
    y=vram_efficiency_data["vram_efficiency"],
    hue=vram_efficiency_data["Cluster"],
    palette="viridis",
    ax=ax,
)
ax.set_xlabel("Index")
ax.set_ylabel("VRAM Efficiency")
ax.set_title("Clustering of VRAM Efficiency")
plt.show()

In [None]:
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns

# One row per user: the mean of the user_weighted_vram_efficiency column over that user's jobs.
# Users whose mean is missing are dropped.
# NOTE(review): assumes gpu_jobs already carries a user_weighted_vram_efficiency column — confirm
# it is produced by calculate_efficiency_metrics.
user_efficiency_data = gpu_jobs.groupby("User").agg({"user_weighted_vram_efficiency": "mean"}).dropna()

# Calculate total number of users
total_users = len(user_efficiency_data)
print(f"Total number of users: {total_users}")

# Label each user "Low Usage" when the mean is below the threshold, otherwise "High Usage"
low_usage_threshold = 0.2
user_efficiency_data["Usage_Group"] = user_efficiency_data["user_weighted_vram_efficiency"].apply(
    lambda x: "Low Usage" if x < low_usage_threshold else "High Usage"
)

# Number of users in each usage group
print("Grouped Data by Usage:")
print(user_efficiency_data.groupby("Usage_Group").size())

# Mean, standard deviation and user count of the efficiency value per usage group
aggregated_data = user_efficiency_data.groupby("Usage_Group").agg(
    {"user_weighted_vram_efficiency": ["mean", "std", "count"]}
)
print("Summary statistics for each usage group:")
print(aggregated_data)

# Bar chart of the per-group mean (one bar per usage group, not per user).
# tight_layout()/show() are commented out below, so this figure is not shown by its own
# plt.show(); it stays open until a later plt.show() in this cell.
plt.figure(figsize=(10, 6))
sns.barplot(x=aggregated_data.index, y=aggregated_data[("user_weighted_vram_efficiency", "mean")], palette="viridis")
plt.xlabel("Usage Group")
plt.ylabel("Mean Weighted VRAM Efficiency")
plt.title("Mean Weighted VRAM Efficiency by Usage Group")
# plt.tight_layout()
# plt.show()

# Cluster on the numeric efficiency column only (Usage_Group is a string label)
clustering_data = user_efficiency_data[["user_weighted_vram_efficiency"]]

# Elbow method: fit KMeans for 1..10 clusters and record the inertia of each fit
inertia = []
for n_clusters in range(1, 11):
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    kmeans.fit(clustering_data)
    inertia.append(kmeans.inertia_)

plt.figure(figsize=(8, 5))
plt.plot(range(1, 11), inertia, marker="o")
plt.xlabel("Number of Clusters")
plt.ylabel("Inertia")
plt.title("Elbow Method for Optimal Clusters")
plt.show()

# Fit the final model; the cluster count is fixed at 3 here, not derived from the elbow plot
optimal_clusters = 3
kmeans = KMeans(n_clusters=optimal_clusters, random_state=42)
user_efficiency_data["Cluster"] = kmeans.fit_predict(clustering_data)

# Display the clustered data
print("Clustered Data by User:")
print(user_efficiency_data)

# Scatter of each user's efficiency value, coloured by cluster label
plt.figure(figsize=(10, 6))
sns.scatterplot(
    x=user_efficiency_data.index,
    y=user_efficiency_data["user_weighted_vram_efficiency"],
    hue=user_efficiency_data["Cluster"],
    palette="viridis",
)
plt.xlabel("User")
plt.ylabel("Weighted VRAM Efficiency Contribution")
plt.title(f"Clustering of Users by Weighted VRAM Efficiency (Optimal Clusters: {optimal_clusters})")
# Hide the x tick labels (one per user)
plt.xticks([])

plt.tight_layout()
plt.show()

# Same scatter, coloured by the threshold-based usage group instead of the cluster label
plt.figure(figsize=(10, 6))
sns.scatterplot(
    x=user_efficiency_data.index,
    y=user_efficiency_data["user_weighted_vram_efficiency"],
    hue=user_efficiency_data["Usage_Group"],
    palette="viridis",
)
plt.xlabel("User")
plt.ylabel("Weighted VRAM Efficiency Contribution")
plt.title("Grouping of Users by Weighted VRAM Efficiency")
plt.xticks([])
plt.tight_layout()
plt.show()

In [None]:
# Analyze GPU jobs that use more CPU than they should
import matplotlib.pyplot as plt
import seaborn as sns

# Keep the jobs whose CPUMemUsage is greater than cpu_gpu_threshold times their GPUMemUsage
cpu_gpu_threshold = 1  # multiplier applied to GPUMemUsage; 1 means a plain "CPU > GPU" comparison
high_cpu_jobs = gpu_jobs[gpu_jobs["CPUMemUsage"] > cpu_gpu_threshold * gpu_jobs["GPUMemUsage"]]

# Per user: sum both usage columns over the selected jobs, then take the ratio of the sums
# and keep the ten users with the largest ratio.
# NOTE(review): the filter above does not exclude GPUMemUsage == 0, so a user whose summed
# GPUMemUsage is 0 presumably gets an infinite ratio and sorts to the top — confirm this is wanted.
top_users_high_cpu = high_cpu_jobs.groupby("User").agg({"CPUMemUsage": "sum", "GPUMemUsage": "sum"}).reset_index()
top_users_high_cpu["CPU_to_GPU_Ratio"] = top_users_high_cpu["CPUMemUsage"] / top_users_high_cpu["GPUMemUsage"]
top_users_high_cpu = top_users_high_cpu.sort_values(by="CPU_to_GPU_Ratio", ascending=False).head(10)

# Display the top users
print("Top users where CPU usage exceeds GPU usage:")
print(top_users_high_cpu)

# Horizontal bar chart of the ratio for those users
plt.figure(figsize=(10, 6))
sns.barplot(x="CPU_to_GPU_Ratio", y="User", data=top_users_high_cpu, palette="viridis")
plt.xlabel("CPU to GPU Usage Ratio")
plt.ylabel("User")
plt.title("Top Users with High CPU Usage Relative to GPU Usage")
plt.tight_layout()
plt.show()

# The ten selected jobs with the largest CPUMemUsage.
# NOTE(review): assumes gpu_jobs already has a cpu_gpu_ratio column; it is not created in this cell — confirm.
top_jobs_high_cpu = high_cpu_jobs.sort_values(by="CPUMemUsage", ascending=False).head(10)
print("Top jobs where CPU usage exceeds GPU usage:")
print(top_jobs_high_cpu[["JobID", "User", "CPUMemUsage", "GPUMemUsage", "cpu_gpu_ratio"]])

# Display the filtered jobs
print("Jobs with high CPU usage relative to GPU usage:")
print(high_cpu_jobs.head())

# Scatter of CPUMemUsage against GPUMemUsage for the selected jobs
plt.figure(figsize=(10, 6))
sns.scatterplot(
    x=high_cpu_jobs["GPUMemUsage"],
    y=high_cpu_jobs["CPUMemUsage"],
    hue=high_cpu_jobs["Interactive"],  # points are coloured by the Interactive column
    palette="viridis",
)
plt.xlabel("GPU Usage")
plt.ylabel("CPU Usage")
plt.title("CPU vs GPU Usage for High CPU Jobs")
plt.tight_layout()
plt.show()

In [None]:
import matplotlib.pyplot as plt
import pandas as pd


# Function to plot VRAM efficiency over time for a specific user
def plot_user_vram_efficiency(user, data):
    """Plot one user's monthly weighted VRAM efficiency and return the plotted table.

    For each calendar month (taken from ``StartTime``) in which ``user`` has jobs, the
    plotted value is the mean of ``vram_efficiency * user_gpu_hours / total_gpu_hours``
    over the user's jobs, where ``user_gpu_hours`` is the user's GPU-hour total for that
    month and ``total_gpu_hours`` is the total over every job in ``data`` for that month.
    Each point is annotated with the user's GPU hours for the month.

    Args:
        user: Value matched against the ``User`` column.
        data: DataFrame with ``User``, ``StartTime``, ``gpu_hours`` and
            ``vram_efficiency`` columns. ``StartTime`` may already be datetime-like or
            be anything ``pd.to_datetime`` can parse. The frame is not modified.

    Returns:
        DataFrame with one row per month and the columns ``Month`` (``"YYYY-MM"``
        string), ``Efficiency`` and ``GPU_Hours``. It is empty, and nothing is plotted,
        when the user has no jobs.
    """
    # Copy only the needed columns so adding "Month" never touches the caller's frame
    jobs = data[["User", "StartTime", "gpu_hours", "vram_efficiency"]].copy()
    jobs["Month"] = pd.to_datetime(jobs["StartTime"]).dt.to_period("M")

    # GPU hours of all users per month, computed once instead of re-filtering for each month
    total_hours_by_month = jobs.groupby("Month")["gpu_hours"].sum().to_dict()
    user_jobs = jobs[jobs["User"] == user]

    rows = []
    for month, month_jobs in user_jobs.groupby("Month"):
        user_gpu_hours = month_jobs["gpu_hours"].sum()
        total_gpu_hours = total_hours_by_month[month]
        if total_gpu_hours:
            efficiency = (month_jobs["vram_efficiency"] * user_gpu_hours / total_gpu_hours).mean()
        else:
            # No GPU hours were recorded that month, so the weighted value is undefined
            efficiency = float("nan")
        rows.append({"Month": str(month), "Efficiency": efficiency, "GPU_Hours": user_gpu_hours})

    monthly_efficiency = pd.DataFrame(rows, columns=["Month", "Efficiency", "GPU_Hours"])
    if monthly_efficiency.empty:
        print(f"No jobs found for user: {user}")
        return monthly_efficiency

    plt.figure(figsize=(10, 6))
    plt.plot(monthly_efficiency["Month"], monthly_efficiency["Efficiency"], marker="o", color="purple")
    plt.title(f"Weighted VRAM Efficiency Over Time for User: {user}")
    plt.xlabel("Month")
    plt.ylabel("Average VRAM Efficiency")
    plt.xticks(rotation=45)

    # Add annotations for GPU hours; months are plotted as categories, so position i is the i-th month
    for position, row in enumerate(monthly_efficiency.itertuples(index=False)):
        if pd.notna(row.Efficiency):
            plt.text(position, row.Efficiency, f"{row.GPU_Hours:.1f} hrs", fontsize=9, ha="center", va="bottom")

    plt.tight_layout()
    plt.show()
    return monthly_efficiency


# Plot the monthly trend for the five users at the top of inefficient_users
problematic_users = inefficient_users.head(5)["User"]
display(inefficient_users.head(5))
for user in problematic_users:
    # The whole gpu_jobs frame is passed (not a per-user slice) because the function also
    # needs every user's GPU hours to compute the monthly totals
    plot_user_vram_efficiency(user, gpu_jobs)

In [None]:
# Spot check: jobs of user 'fikram_umass_edu' whose StartTime falls in January 2025
fikram_jobs = gpu_jobs.loc[gpu_jobs["User"] == "fikram_umass_edu"]

start_times = fikram_jobs["StartTime"].dt
started_in_january_2025 = (start_times.year == 2025) & (start_times.month == 1)
january_2025_jobs = fikram_jobs.loc[started_in_january_2025]

# Show only the identifying columns and the weighted efficiency
jobs = january_2025_jobs[["JobID", "StartTime", "user_weighted_vram_efficiency"]]
print("Jobs for 'fikram_umass_edu' with StartTime in January 2025:")
print(jobs)

In [None]:
# Show the row(s) of gpu_jobs for one specific JobID
gpu_jobs.loc[gpu_jobs["JobID"] == 28257429]