In [1]:
# Import required modules
import sys
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Treat the parent of the current working directory as the project root
# (assumes the notebook is launched from a folder directly under the root — TODO confirm).
project_root = str(
    Path.cwd()   # current working directory of the kernel
    .resolve()   # absolute path with symlinks resolved
    .parent      # one level up
)
print(f"Project root: {project_root}")

Project root: C:\Users\Nitya Karthik A\ds4cg-job-analytics


In [5]:
# Put the project root at the front of sys.path so the project-local `src`
# package imported below can be found from this notebook's directory.
# The membership check keeps sys.path free of duplicates when the cell is re-run.
if project_root not in sys.path:
    sys.path.insert(0, project_root)

# Project-local imports; these must stay after the sys.path update above
# (presumably `src` lives directly under project_root — confirm).
from src.analysis import efficiency_analysis as ea
from src.visualization import JobsWithMetricsVisualizer, UsersWithMetricsVisualizer

# Load IPython's autoreload extension so edits to imported modules are picked
# up during development without restarting the kernel.
%load_ext autoreload
# Mode 2 reloads ALL modules (except those excluded via %aimport) before each
# execution. Mode 1 is the one restricted to modules registered with %aimport.
%autoreload 2

In [6]:
# Read the 'Jobs' table from the DuckDB file via the project's loader helper.
# NOTE: the path is relative to the kernel's working directory.
preprocessed_jobs_df = ea.load_preprocessed_jobs_dataframe_from_duckdb(
    db_path='../data/slurm_data.db', table_name='Jobs'
)

# Preview the first ten rows, then report (rows, columns)
display(preprocessed_jobs_df.head(10))
print(preprocessed_jobs_df.shape)

Connected to ../data/slurm_data.db


RuntimeError: Failed to load jobs DataFrame: 0

In [None]:
# Collect the distinct values of the 'Partition' column (order of first appearance)
partitions = (
    preprocessed_jobs_df['Partition']
    .unique()
)
print(f"Unique partitions: {partitions}")
    

In [None]:
# For every partition, gather the distinct VRAM constraint values seen on its jobs
result = (
    preprocessed_jobs_df
    .groupby('Partition')['vram_constraint']
    .unique()
)

print("VRAM constraints for each partition:")
for partition, constraints in result.items():
    print(f"Partition: {partition}, VRAM Constraints: {constraints}")

In [None]:
def plot_vram_constraints_for_partition(df, partition_name):
    """
    Create a bar plot showing job counts for each VRAM constraint in a specific partition.

    Nothing is plotted (a message is printed instead) when the partition has no
    rows, or when none of its rows has a non-missing 'vram_constraint' value.

    Parameters:
    df: DataFrame containing the job data; must provide the 'Partition' and
        'vram_constraint' columns
    partition_name: Name of the partition to plot

    Returns:
    None. The figure is shown with plt.show() and a text summary is printed.
    """
    # Filter data for the specific partition
    partition_data = df[df['Partition'] == partition_name]

    if partition_data.empty:
        print(f"No data found for partition: {partition_name}")
        return

    # Count jobs for each VRAM constraint in this partition.
    # value_counts() drops missing values, so the result can be empty even
    # though the partition itself has rows.
    vram_counts = partition_data['vram_constraint'].value_counts().sort_index()

    if vram_counts.empty:
        # Avoid rendering a blank figure and an empty summary.
        print(f"No VRAM constraint values found for partition: {partition_name}")
        return

    # Widen the figure with the number of bars, but never below 6 inches
    fig, ax = plt.subplots(figsize=(max(6, len(vram_counts) * 0.8), 6))

    # Use evenly spaced categorical positions rather than the numeric VRAM values
    x_positions = list(range(len(vram_counts)))
    bars = ax.bar(x_positions, vram_counts.values, width=0.6)

    # Customize the plot
    ax.set_title(f'Job Count by VRAM Constraint for Partition: {partition_name}')
    ax.set_xlabel('VRAM Constraint')
    ax.set_ylabel('Number of Jobs')

    # Set x-axis labels to show GB values
    ax.set_xticks(x_positions)
    ax.set_xticklabels([f'{int(vram)}GB' for vram in vram_counts.index])

    # Vertical gap between a bar and its label: 1% of the tallest bar, at least 1.
    # Computed once because it does not depend on the individual bar.
    label_offset = max(1, vram_counts.max() * 0.01)
    for bar, count in zip(bars, vram_counts.values):
        ax.text(
            bar.get_x() + bar.get_width() / 2,
            bar.get_height() + label_offset,
            str(count),
            ha='center',
            va='bottom',
            fontsize=10,
        )

    ax.grid(axis='y', alpha=0.3)
    fig.tight_layout()
    plt.show()
    # Release the figure so repeated calls (one per partition) do not accumulate open figures
    plt.close(fig)

    # Print summary
    print(f"\nSummary for Partition '{partition_name}':")
    for vram, count in vram_counts.items():
        print(f"  {count} jobs for {vram}GB VRAM constraint")


In [None]:
# Draw one VRAM-constraint bar chart (plus printed summary) per partition
for partition in partitions:
    plot_vram_constraints_for_partition(
        df=preprocessed_jobs_df,
        partition_name=partition,
    )