# Resource usage (real data)
Using Slurm and pyseff, we can get the runtime and memory usage of each tool.



In [7]:
import os
os.chdir('/clusterfs/jgi/scratch/science/metagen/neri/code/blits/spacer_matching_bench/')
from bench import *
from bench.utils.functions import *
from bench.utils.pyseff import *
import polars as pl
pl.Config(tbl_rows=65)
sacct_df = pyseff()
sacct_df

BaseJobID,JobName,State,ExitCode,AllocCPUS,Elapsed,TotalCPU,CPU_Efficiency,MaxRSS_MB,ReqMem_MB,Memory_Efficiency
str,str,list[str],list[str],i64,str,str,f64,f64,f64,f64
"""14889230""","""mummer4""","[""COMPLETED"", ""FAILED""]","[""0:0"", ""6:0""]",34,"""00:49:30""","""00:00:00""",0.0,0.082031,335872.0,0.0
"""15408595""","""blastn""","[""COMPLETED""]","[""0:0""]",40,"""7-11:54:06""","""190-16:57:28""",63.6,0.0859375,18432.0,0.0
"""15441355""","""blastn""","[""COMPLETED""]","[""0:0""]",50,"""3-00:59:06""","""46-16:32:44""",30.71,343.261719,18432.0,1.86
"""15441356""","""blastn""","[""COMPLETED""]","[""0:0""]",50,"""3-17:17:16""","""72-21:08:28""",39.18,398.679688,18432.0,2.16
"""15296783""","""bowtie1""","[""FAILED"", ""COMPLETED""]","[""0:0"", ""11:0""]",60,"""00:00:03""","""00:00:00""",0.0,0.011719,274432.0,0.0
"""14917028""","""bbmap_skimmer""","[""FAILED"", ""COMPLETED""]","[""1:0"", ""0:0""]",14,"""00:32:33""","""02:07:24""",27.96,129259.683594,131072.0,98.62
"""14961307""","""bbmap_skimmer""","[""COMPLETED""]","[""0:0""]",14,"""06:23:45""","""2-04:00:12""",58.08,0.082031,438272.0,0.0
"""14915189""","""bowtie2""","[""OUT_OF_MEMORY"", ""COMPLETED""]","[""0:125"", ""0:0""]",14,"""01:06:27""","""00:00:00""",0.0,0.011719,131072.0,0.0
"""15192702""","""strobealign""","[""COMPLETED""]","[""0:0""]",12,"""01:03:48""","""02:35:28""",20.31,0.0859375,499712.0,0.0
"""15192707""","""bowtie2""","[""COMPLETED""]","[""0:0""]",12,"""2-08:00:33""","""11-19:10:26""",42.13,237594.253906,499712.0,47.55


# Get tool info group by tool
Excluding cancelled jobs, for each tool that successfully completed, we will get the runtime and peak memory usage.
For tools that required multiple jobs (blastn), and splitting the query into chunks, we will sum the runtime, and report the maximum peak memory usage per chunk size. TODO: remember to note that blastn could not complete the job due to time out error, hence the split.


In [8]:
# # Group by JobID first to combine the tool name with its batch/extern records
# sacct_df = pl.read_csv('results/real_data/sacct.out', separator='|')

# Get tool info - exclude cancelled jobs
tool_info = (sacct_df
    .filter(~pl.col('JobName').is_in(['batch', 'extern']))
    # .filter(~pl.col('State').str.contains('CANCELLED'))  # Remove cancelled jobs
    .select(['BaseJobID', 'JobName', 'AllocCPUS', 'ReqMem_MB', 'State'])
    .unique())

# Get memory info from batch rows
memory_info = (sacct_df
    .filter(pl.col('JobName') == 'batch')
    .select(['BaseJobID', 'MaxRSS_MB'])
    .filter(pl.col('MaxRSS_MB').is_not_null()))

# Combine all information
memory_info = memory_info.with_columns(pl.col('MaxRSS_MB').cast(pl.Int64).alias('Peak_Memory_Gb'))
memory_info = memory_info.with_columns((pl.col('Peak_Memory_Gb') /  1024).alias('Peak_Memory_Gb'))

job_analysis = (tool_info
    .join(memory_info, on='BaseJobID', how='left')
    .select([
        'BaseJobID',
        pl.col('JobName').alias('Tool'),
        'AllocCPUS',
        'State',    
        'Peak_Memory_Gb',
        'ReqMem_MB'
    ])
    .sort([ 'Peak_Memory_Gb'], descending=True)) #'State',


# Group by tool to see patterns
# first we remove out of memory jobs, and for timed out jobs we sum the time.
job_analysis


BaseJobID,Tool,AllocCPUS,State,Peak_Memory_Gb,ReqMem_MB
str,str,i64,list[str],f64,f64
"""15192702""","""strobealign""",12,"[""COMPLETED""]",,499712.0
"""15192703""","""mmseqs""",12,"[""COMPLETED""]",,499712.0
"""14915188""","""bowtie1""",14,"[""COMPLETED"", ""OUT_OF_MEMORY""]",,131072.0
"""14917028""","""bbmap_skimmer""",14,"[""FAILED"", ""COMPLETED""]",,131072.0
"""15408595""","""blastn""",40,"[""COMPLETED""]",,18432.0
"""15408554""","""blastn""",36,"[""FAILED"", ""COMPLETED""]",,28672.0
"""14904950""","""bowtie2""",34,"[""COMPLETED"", ""OUT_OF_MEMORY""]",,233472.0
"""14887241""","""bowtie2""",34,"[""COMPLETED"", ""FAILED""]",,394240.0
"""15296985""","""bowtie1""",60,"[""OUT_OF_MEMORY"", ""COMPLETED""]",,438272.0
"""15192729""","""lexicmap""",20,"[""COMPLETED""]",,141312.0


In [9]:
jobids_to_remove = []
for i in job_analysis.iter_rows(named=True):
    set1 = set(i['State'])
    if len(set1.intersection({'OUT_OF_MEMORY','CANCELLED','PREEMPTED'})) > 0:
        jobids_to_remove.append(i['BaseJobID'])
jobids_to_remove
job_analysis = job_analysis.filter(~pl.col('BaseJobID').is_in(jobids_to_remove))
job_analysis 

BaseJobID,Tool,AllocCPUS,State,Peak_Memory_Gb,ReqMem_MB
str,str,i64,list[str],f64,f64
"""15192702""","""strobealign""",12,"[""COMPLETED""]",,499712.0
"""15192703""","""mmseqs""",12,"[""COMPLETED""]",,499712.0
"""14917028""","""bbmap_skimmer""",14,"[""FAILED"", ""COMPLETED""]",,131072.0
"""15408595""","""blastn""",40,"[""COMPLETED""]",,18432.0
"""15408554""","""blastn""",36,"[""FAILED"", ""COMPLETED""]",,28672.0
"""14887241""","""bowtie2""",34,"[""COMPLETED"", ""FAILED""]",,394240.0
"""15192729""","""lexicmap""",20,"[""COMPLETED""]",,141312.0
"""15296793""","""bowtie1""",60,"[""FAILED"", ""COMPLETED""]",,274432.0
"""14887288""","""bbmap""",34,"[""FAILED"", ""COMPLETED""]",,394240.0
"""14928594""","""bowtie2""",14,"[""COMPLETED""]",,397312.0


In [10]:
tool_summary = (job_analysis 
    .group_by('Tool')
    .agg([
        pl.col('State').value_counts(),
        pl.col('Peak_Memory_Gb').max().alias('Peak_Memory_Gb'),
        pl.col('ReqMem_MB').min().alias('Min_Requested_Mem'),
        pl.col('ReqMem_MB').max().alias('Max_Requested_Mem')
    ])
    .sort('Tool'))

#for  blastn we need to sum the runtime, and report the maximum peak memory usage per chunk size.
blastn_job_analysis = job_analysis.filter(pl.col('Tool') == 'blastn')
blastn_job_analysis = blastn_job_analysis.group_by('Tool').agg([
    pl.col('State').value_counts(),
    pl.col('Peak_Memory_Gb').max().alias('Peak_Memory_Gb'),
    pl.col('ReqMem_MB').min().alias('Min_Requested_Mem'),
    pl.col('ReqMem_MB').max().alias('Max_Requested_Mem')
])
blastn_job_analysis


print("\nPer-job details:")
print(job_analysis)
print("\nPer-tool summary:")
print(tool_summary)



Per-job details:
shape: (49, 6)
┌───────────┬───────────────┬───────────┬─────────────────────────┬────────────────┬───────────┐
│ BaseJobID ┆ Tool          ┆ AllocCPUS ┆ State                   ┆ Peak_Memory_Gb ┆ ReqMem_MB │
│ ---       ┆ ---           ┆ ---       ┆ ---                     ┆ ---            ┆ ---       │
│ str       ┆ str           ┆ i64       ┆ list[str]               ┆ f64            ┆ f64       │
╞═══════════╪═══════════════╪═══════════╪═════════════════════════╪════════════════╪═══════════╡
│ 15192702  ┆ strobealign   ┆ 12        ┆ ["COMPLETED"]           ┆ null           ┆ 499712.0  │
│ 15192703  ┆ mmseqs        ┆ 12        ┆ ["COMPLETED"]           ┆ null           ┆ 499712.0  │
│ 14917028  ┆ bbmap_skimmer ┆ 14        ┆ ["FAILED", "COMPLETED"] ┆ null           ┆ 131072.0  │
│ 15408595  ┆ blastn        ┆ 40        ┆ ["COMPLETED"]           ┆ null           ┆ 18432.0   │
│ 15408554  ┆ blastn        ┆ 36        ┆ ["FAILED", "COMPLETED"] ┆ null           ┆ 28672.0  