In [None]:
import sys
from pathlib import Path

# Make modules under the sibling "src" directory importable from this notebook.
sys.path.append(str(Path.cwd().parent.joinpath("src")))

In [None]:
project_root = str(Path.cwd().resolve().parent)
print(f"Project root: {project_root}")

# Add project root to sys.path for module imports
if project_root not in sys.path:
    sys.path.insert(0, project_root)


# Automatically reload modules before executing code
# This is useful for development to see changes without restarting the kernel.
%load_ext autoreload
# Reload all modules imported with %aimport every time before executing the Python code typed.
%autoreload 1
%aimport src.analysis.vram_usage, src.preprocess.preprocess

In [None]:
# Import required libraries and modules
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans

In [None]:
from src.analysis.efficiency_analysis import EfficiencyAnalysis
from src.preprocess.preprocess import preprocess_data
from src.database.database_connection import DatabaseConnection

# Open the database file located at ../slurm_data.db (relative to the working directory).
db = DatabaseConnection(db_url="../slurm_data.db")

# Fetch every row of the Jobs table whose GPUs column is positive and
# convert the query result with to_df().
gpu_jobs_query = "SELECT * FROM Jobs WHERE GPUs > 0"
gpu_df = db.connection.query(gpu_jobs_query).to_df()

In [None]:
# Run the project's preprocessing on the raw query result. Per the keyword
# arguments: no minimum elapsed time, failed/cancelled jobs excluded,
# CPU-only jobs not filtered out.
preprocessed_df = preprocess_data(
    gpu_df,
    min_elapsed_seconds=0,
    include_failed_cancelled_jobs=False,
    include_cpu_only_jobs=True,
)

# Wrap the preprocessed frame in the analysis helper.
efficiency_analyzer = EfficiencyAnalysis(df=preprocessed_df, table_name="Jobs")

# Select the jobs used for clustering: GPU count filter of 1, no VRAM
# constraint filter, and GPU memory usage bounded by (0, inf) with
# "inclusive" set to False.
gpu_mem_usage_bounds = {"min": 0, "max": np.inf, "inclusive": False}
gpu_jobs = efficiency_analyzer.filter_jobs_for_analysis(
    gpu_count_filter=1,
    vram_constraint_filter=None,
    gpu_mem_usage_filter=gpu_mem_usage_bounds,
)

In [None]:
# Compute efficiency metrics for the filtered jobs.
# `user_efficiency_data` is a second name for the same object as `jobs_df`
# (no copy is made), so the column added below is visible through both names.
# NOTE(review): the variable name says "user", but the metrics come from
# calculate_job_efficiency_metrics -- rows are presumably jobs; confirm.
jobs_df = efficiency_analyzer.calculate_job_efficiency_metrics(gpu_jobs)
user_efficiency_data = jobs_df
print(user_efficiency_data.columns)

# Rows whose alloc_vram_efficiency is below this value are labelled "Low Usage".
low_usage_threshold = 0.2


def label_usage(efficiency):
    """Return "Low Usage" when below the threshold, otherwise "High Usage"."""
    if efficiency < low_usage_threshold:
        return "Low Usage"
    # Anything that does not compare as "< threshold" (including NaN) lands here.
    return "High Usage"


user_efficiency_data["Usage_Group"] = user_efficiency_data["alloc_vram_efficiency"].map(label_usage)

# Report how many rows fall into each usage group.
print("Grouped Data by Usage:")
usage_group_sizes = user_efficiency_data.groupby("Usage_Group").size()
print(usage_group_sizes)

In [None]:
# Determine the optimal number of clusters using the elbow method
print(user_efficiency_data.columns)

# Product of the job_hours and used_vram_gib columns, stored on the frame.
# It is not part of the clustering features selected below.
user_efficiency_data["vram_hours"] = user_efficiency_data["job_hours"] * user_efficiency_data["used_vram_gib"]

# Double brackets select a one-column DataFrame, giving KMeans 2-D input.
clustering_data = user_efficiency_data[["alloc_vram_efficiency"]]
# Turn +/-inf into NaN and drop those rows so only finite values are clustered.
clustering_data = clustering_data.replace([np.inf, -np.inf], np.nan).dropna()
print(clustering_data.describe())

# Fit KMeans for each candidate cluster count and record the resulting inertia.
cluster_counts = range(1, 11)
inertia = []
for n_clusters in cluster_counts:
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    kmeans.fit(clustering_data)
    inertia.append(kmeans.inertia_)

# Plot the elbow method
plt.figure(figsize=(8, 5))
plt.plot(cluster_counts, inertia, marker="o")
plt.xlabel("Number of Clusters")
plt.ylabel("Inertia")
plt.title("Elbow Method for Optimal Clusters")
plt.show()

In [None]:
# Apply KMeans clustering
optimal_clusters = 4  # Replace with the optimal number of clusters determined from the elbow method
kmeans = KMeans(n_clusters=optimal_clusters, random_state=42)

# Fit on the feature columns only. Excluding any "Cluster" column left behind
# by a previous run of this cell keeps old labels from being used as a feature.
cluster_features = clustering_data.drop(columns="Cluster", errors="ignore")
clustering_data["Cluster"] = kmeans.fit_predict(cluster_features)

# Attach the labels to the full frame by index. With a left join, rows that are
# absent from clustering_data get NaN. A stale "Cluster" column is dropped
# first; otherwise a re-run would produce Cluster_x / Cluster_y instead of
# a single "Cluster" column.
user_efficiency_data = user_efficiency_data.drop(columns="Cluster", errors="ignore").merge(
    clustering_data[["Cluster"]], left_index=True, right_index=True, how="left"
)

# Display the clustered data
print("Clustered Data by User:")
print(user_efficiency_data)

In [None]:
# Scatter each row's alloc_vram_efficiency (capped at 1.0) against its index,
# coloured by the assigned cluster.
# NOTE(review): the axis labels and title mention users and a "weighted"
# efficiency, while the plotted column is alloc_vram_efficiency -- confirm
# the wording matches the data.
fig, ax = plt.subplots(figsize=(10, 6))
capped_efficiency = user_efficiency_data["alloc_vram_efficiency"].clip(upper=1.0)
sns.scatterplot(
    x=user_efficiency_data.index,
    y=capped_efficiency,
    hue=user_efficiency_data["Cluster"],
    palette="viridis",
    ax=ax,
)
ax.set_xlabel("User")
ax.set_ylabel("Weighted VRAM Efficiency Contribution")
ax.set_title(f"Clustering of Users by Weighted VRAM Efficiency (Optimal Clusters: {optimal_clusters})")
# Hide the x tick marks.
ax.set_xticks([])
fig.tight_layout()
plt.show()