In [1]:
# Import required modules
import sys
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [4]:
# Treat the parent of the (symlink-resolved) working directory as the project root
working_dir = Path.cwd().resolve()
project_root = str(working_dir.parent)
print(f"Project root: {project_root}")

Project root: C:\Users\Nitya Karthik A\ds4cg-job-analytics


In [3]:
# Put the project root at the front of sys.path (once) so the `src` package below can be imported
if project_root not in sys.path:
    sys.path.insert(0, project_root)

# Project-local imports; these only resolve after the sys.path change above
from src.analysis import efficiency_analysis as ea
from src.visualization import JobsWithMetricsVisualizer, UsersWithMetricsVisualizer

# Load IPython's autoreload extension.
# This is useful for development to see changes without restarting the kernel.
%load_ext autoreload
# Mode 2: reload all modules (except those excluded with %aimport) every time before executing the Python code typed.
%autoreload 2

In [7]:
# Read the preprocessed 'Jobs' table from the DuckDB file (path is relative to the working directory).
# The recorded run of this cell failed because another process had the database file open.
preprocessed_jobs_df = ea.load_preprocessed_jobs_dataframe_from_duckdb(db_path='../data/slurm_data.db', table_name='Jobs')

# Preview the first ten rows, then report the frame's shape
jobs_preview = preprocessed_jobs_df.head(10)
display(jobs_preview)
print(preprocessed_jobs_df.shape)

RuntimeError: Failed to load jobs DataFrame: IO Error: Cannot open file "c:\users\nitya karthik a\ds4cg-job-analytics\notebooks\..\data\slurm_data.db": The process cannot access the file because it is being used by another process.

File is already open in 
C:\Users\Nitya Karthik A\anaconda3\envs\duckdb\python.exe (PID 24884)

In [None]:
# Collect the distinct values of the 'Partition' column, in order of first appearance
partitions = pd.unique(preprocessed_jobs_df['Partition'])
print(f"Unique partitions: {partitions}")
    

In [None]:
# For every partition, list the distinct VRAM constraint values seen on its jobs
vram_by_partition = preprocessed_jobs_df.groupby('Partition')['vram_constraint']
result = vram_by_partition.unique()
print("VRAM constraints for each partition:")
for partition in result.index:
    constraints = result.loc[partition]
    print(f"Partition: {partition}, VRAM Constraints: {constraints}")

In [None]:
def plot_vram_constraints_for_partition(df, partition_name):
    """
    Create a bar plot showing job counts for each VRAM constraint in a specific partition.

    Parameters:
    df: DataFrame containing the job data; the 'Partition' and
        'vram_constraint' columns are read
    partition_name: Name of the partition to plot

    Returns:
    None. The figure is shown with plt.show() and a per-constraint summary is
    printed. If the partition has no rows, or none of its rows has a VRAM
    constraint, a message is printed and nothing is plotted.
    """
    # Filter data for the specific partition
    partition_data = df[df['Partition'] == partition_name]

    if partition_data.empty:
        print(f"No data found for partition: {partition_name}")
        return

    # Count jobs for each VRAM constraint in this partition.
    # value_counts() skips missing values, so this can be empty even though
    # the partition itself has jobs.
    vram_counts = partition_data['vram_constraint'].value_counts().sort_index()

    if vram_counts.empty:
        print(f"No VRAM constraints found for partition: {partition_name}")
        return

    # One label per bar, shared by the tick labels and the printed summary so
    # both show the same text (e.g. '16GB' rather than '16.0GB' in one place)
    vram_labels = [f'{int(vram)}GB' for vram in vram_counts.index]
    job_counts = vram_counts.values

    # Create the bar plot; the figure widens with the number of bars
    fig, ax = plt.subplots(figsize=(max(6, len(vram_counts) * 0.8), 6))

    # Use categorical x-axis with custom width
    x_positions = range(len(vram_counts))
    bars = ax.bar(x_positions, job_counts, width=0.6)

    # Customize the plot
    ax.set_title(f'Job Count by VRAM Constraint for Partition: {partition_name}')
    ax.set_xlabel('VRAM Constraint')
    ax.set_ylabel('Number of Jobs')

    # Set x-axis labels to show GB values
    ax.set_xticks(x_positions)
    ax.set_xticklabels(vram_labels)

    # Gap between bar top and its label: 1% of the tallest bar, at least 1.
    # Computed once because it does not depend on the individual bar.
    label_offset = max(1, vram_counts.max() * 0.01)

    # Add value labels on top of bars with smaller font
    for bar, count in zip(bars, job_counts):
        ax.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + label_offset,
                str(count), ha='center', va='bottom', fontsize=10)

    ax.grid(axis='y', alpha=0.3)
    plt.tight_layout()
    plt.show()

    # Print summary
    print(f"\nSummary for Partition '{partition_name}':")
    for label, count in zip(vram_labels, job_counts):
        print(f"  {count} jobs for {label} VRAM constraint")


In [None]:
# Draw one VRAM-constraint bar chart (plus printed summary) per partition
for partition_name in partitions:
    plot_vram_constraints_for_partition(preprocessed_jobs_df, partition_name)

In [None]:
def plot_percentage_stacked_vram_by_partition(df):
    """
    Create a percentage stacked bar plot (all bars have same height, showing proportions).

    Parameters:
    df: DataFrame containing the job data; the 'Partition' and
        'vram_constraint' columns are read

    Returns:
    None. The figure is shown with plt.show(). If no row has both a partition
    and a VRAM constraint, a message is printed and nothing is plotted.
    """
    key_columns = ['Partition', 'vram_constraint']

    # groupby drops rows whose key is missing, so only rows with both keys
    # present contribute; bail out early when there are none
    if not df[key_columns].notna().all(axis=1).any():
        print("No jobs with both a partition and a VRAM constraint to plot.")
        return

    # Create a pivot table: rows are partitions, columns are VRAM constraints
    pivot_data = df.groupby(key_columns).size().unstack(fill_value=0)

    # Convert to percentages of each partition's row total
    pivot_percentage = pivot_data.div(pivot_data.sum(axis=1), axis=0) * 100

    # Create the plot; the figure widens with the number of partitions
    fig, ax = plt.subplots(figsize=(max(8, len(pivot_data) * 1.2), 8))

    # Create stacked bars; `bottom` holds the running height of each stack
    x_positions = range(len(pivot_percentage))
    bottom = np.zeros(len(pivot_percentage))
    colors = plt.cm.Set3(np.linspace(0, 1, len(pivot_percentage.columns)))

    for vram_constraint, color in zip(pivot_percentage.columns, colors):
        values = pivot_percentage[vram_constraint].values
        bars = ax.bar(x_positions, values, bottom=bottom,
                      label=f'{int(vram_constraint)}GB', color=color, width=0.7)

        # Add percentage labels, centred vertically inside each segment.
        # Pairing each bar with its base avoids a per-bar search of the container.
        for bar, value, base in zip(bars, values, bottom):
            if value > 5:  # Only label segments >5%
                ax.text(bar.get_x() + bar.get_width() / 2,
                        base + value / 2,
                        f'{value:.1f}%', ha='center', va='center',
                        fontsize=9, fontweight='bold')

        # New array rather than in-place add, so the array handed to ax.bar
        # above is never modified afterwards
        bottom = bottom + values

    # Customize the plot
    ax.set_title('VRAM Constraint Distribution by Partition (Percentage)', fontsize=14, pad=20)
    ax.set_xlabel('GPU Partition', fontsize=12)
    ax.set_ylabel('Percentage of Jobs', fontsize=12)

    # Set x-axis labels; tilt them once there are more than five partitions
    ax.set_xticks(x_positions)
    ax.set_xticklabels(pivot_percentage.index, rotation=45 if len(pivot_percentage) > 5 else 0)

    # Add legend, anchored outside the right edge of the axes
    ax.legend(title='VRAM Constraint', bbox_to_anchor=(1.05, 1), loc='upper left')

    # Set y-axis to 0-100%
    ax.set_ylim(0, 100)
    ax.grid(axis='y', alpha=0.3)
    ax.set_axisbelow(True)

    plt.tight_layout()
    plt.show()


In [None]:
# List the partitions present in the jobs table
available_partitions = preprocessed_jobs_df['Partition'].unique()
print("Available partitions:", available_partitions)


# Draw the percentage-stacked VRAM constraint chart across all partitions
plot_percentage_stacked_vram_by_partition(preprocessed_jobs_df)
