In [3]:
# Select the interactive "notebook" matplotlib backend for the figures below.
%matplotlib notebook
# Load the autoreload extension; the plotting cells invoke %autoreload before use.
%load_ext autoreload
# Show the working directory, since the relative paths used below depend on it.
%pwd

'/ocean/projects/asc170022p/mtragoza/lung-project/notebooks'

In [4]:
import sys
import pandas as pd
# param_search is not imported as an installed package here: its directory
# (relative to the notebook's working directory) is put on sys.path so that the
# import below resolves -- presumably a local checkout; confirm.
sys.path.append('../../param_search')
import param_search as ps

## Setup experiment

[[Setup](#Setup-experiment)] [[Submit](#Submit-jobs)] [[Monitor](#Monitor-jobs)] [[Analyze](#Analyze-results)]

In [3]:
# Define the job script template and the job name format.
#
# `template` is a bash script with SLURM #SBATCH directives that runs
# train1d.py once. The {job_name}, {pde_name}, {image_size}, {batch_size} and
# {n_nodes} placeholders are str.format fields (presumably filled in per job by
# ps.submit -- confirm). The backslash right after the opening quotes keeps the
# shebang on the first line of the script, and each `\\` is a single backslash
# at runtime, i.e. a bash line continuation.
template = '''\
#!/bin/bash
#SBATCH --job-name={job_name}
#SBATCH --account=asc170022p
#SBATCH --partition=GPU-shared
#SBATCH --gres=gpu:1
#SBATCH -x v034
#SBATCH --time=48:00:00
#SBATCH -o %J.stdout
#SBATCH -e %J.stderr
#SBATCH --mail-type=all

hostname
pwd
module load anaconda3
conda activate /ocean/projects/asc170022p/mtragoza/mambaforge/envs/4DCT
nvidia-smi

mpirun -n 1 python ../../../train1d.py \\
    --out_name {job_name} \\
    --pde_name {pde_name} \\
    --image_size {image_size} \\
    --batch_size {batch_size} \\
    --n_nodes {n_nodes}

echo Done
'''
# Job name pattern; note the field order is image_size, n_nodes, batch_size.
name_format = 'train1d_{pde_name}_{image_size}_{n_nodes}_{batch_size}'

In [74]:
# Hyperparameter grid: a single PDE combined with the image sizes, node counts
# and batch sizes listed here.
search_grid = dict(
    pde_name='poisson',
    image_size=[128, 256, 512, 1024],
    n_nodes=[128, 256, 512, 1024],
    batch_size=[64, 128, 256],
)
param_space = ps.ParamSpace(**search_grid)

# Preview the job name generated for every configuration ...
for params in param_space:
    print(name_format.format(**params))

# ... followed by the total number of configurations.
print(len(param_space))

train1d_poisson_128_128_64
train1d_poisson_128_128_128
train1d_poisson_128_128_256
train1d_poisson_128_256_64
train1d_poisson_128_256_128
train1d_poisson_128_256_256
train1d_poisson_128_512_64
train1d_poisson_128_512_128
train1d_poisson_128_512_256
train1d_poisson_128_1024_64
train1d_poisson_128_1024_128
train1d_poisson_128_1024_256
train1d_poisson_256_128_64
train1d_poisson_256_128_128
train1d_poisson_256_128_256
train1d_poisson_256_256_64
train1d_poisson_256_256_128
train1d_poisson_256_256_256
train1d_poisson_256_512_64
train1d_poisson_256_512_128
train1d_poisson_256_512_256
train1d_poisson_256_1024_64
train1d_poisson_256_1024_128
train1d_poisson_256_1024_256
train1d_poisson_512_128_64
train1d_poisson_512_128_128
train1d_poisson_512_128_256
train1d_poisson_512_256_64
train1d_poisson_512_256_128
train1d_poisson_512_256_256
train1d_poisson_512_512_64
train1d_poisson_512_512_128
train1d_poisson_512_512_256
train1d_poisson_512_1024_64
train1d_poisson_512_1024_128
train1d_poisson_512_1024

## Submit jobs

[[Setup](#Setup-experiment)] [[Submit](#Submit-jobs)] [[Monitor](#Monitor-jobs)] [[Analyze](#Analyze-results)]

In [90]:
# Experiments defined so far, listed by date; the last entry is the active one.
expt_names = ['2024-03-08_poisson', '2024-03-09_poisson', '2024-03-12_poisson']
expt_name = expt_names[-1]

In [85]:
# Submission is switched off so that re-running the notebook does not queue the
# jobs again; set the flag to True to submit and save the resulting job table.
SUBMIT_JOBS = False

if SUBMIT_JOBS:
    jobs = ps.submit(template, name_format, param_space, work_dir=expt_name)
    jobs.to_csv(f'{expt_name}.jobs')


Unnamed: 0,pde_name,image_size,n_nodes,batch_size,job_name,job_id,partition,job_state,node_id,runtime,work_dir,array_idx
0,poisson,128,128,64,train1d_poisson_128_128_64,22919052,GPU-shared,PENDING,(None),0:00,/ocean/projects/asc170022p/mtragoza/lung-proje...,
1,poisson,128,128,128,train1d_poisson_128_128_128,22919053,GPU-shared,PENDING,(None),0:00,/ocean/projects/asc170022p/mtragoza/lung-proje...,
2,poisson,128,128,256,train1d_poisson_128_128_256,22919054,GPU-shared,PENDING,(None),0:00,/ocean/projects/asc170022p/mtragoza/lung-proje...,
3,poisson,128,256,64,train1d_poisson_128_256_64,22919055,GPU-shared,PENDING,(None),0:00,/ocean/projects/asc170022p/mtragoza/lung-proje...,
4,poisson,128,256,128,train1d_poisson_128_256_128,22919056,GPU-shared,PENDING,(None),0:00,/ocean/projects/asc170022p/mtragoza/lung-proje...,
5,poisson,128,256,256,train1d_poisson_128_256_256,22919057,GPU-shared,PENDING,(None),0:00,/ocean/projects/asc170022p/mtragoza/lung-proje...,
6,poisson,128,512,64,train1d_poisson_128_512_64,22919058,GPU-shared,PENDING,(None),0:00,/ocean/projects/asc170022p/mtragoza/lung-proje...,
7,poisson,128,512,128,train1d_poisson_128_512_128,22919059,GPU-shared,PENDING,(None),0:00,/ocean/projects/asc170022p/mtragoza/lung-proje...,
8,poisson,128,512,256,train1d_poisson_128_512_256,22919060,GPU-shared,PENDING,(None),0:00,/ocean/projects/asc170022p/mtragoza/lung-proje...,
9,poisson,128,1024,64,train1d_poisson_128_1024_64,22919061,GPU-shared,PENDING,(None),0:00,/ocean/projects/asc170022p/mtragoza/lung-proje...,


## Monitor jobs

[[Setup](#Setup-experiment)] [[Submit](#Submit-jobs)] [[Monitor](#Monitor-jobs)] [[Analyze](#Analyze-results)]

In [10]:
# Experiments defined so far, listed by date; the last entry is the one monitored.
expt_names = ['2024-03-08_poisson', '2024-03-09_poisson', '2024-03-12_poisson']
expt_name = expt_names[-1]

In [11]:
# Reload the job table written at submission time, then ask param_search for
# the status of each job (with stderr parsing enabled).
jobs_file = f'{expt_name}.jobs'
jobs = pd.read_csv(jobs_file, index_col=0)
status = ps.status(jobs, parse_stderr=True)
status

Unnamed: 0_level_0,index,pde_name,image_size,n_nodes,batch_size,job_name,partition,job_state,node_id,runtime,work_dir,array_idx,stdout,stderr
job_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
22919052,0,poisson,128,128,64,train1d_poisson_128_128_64,GPU-shared,,,,/ocean/projects/asc170022p/mtragoza/lung-proje...,,v033.ib.bridges2.psc.edu\n/ocean/projects/asc1...,
22919053,1,poisson,128,128,128,train1d_poisson_128_128_128,GPU-shared,,,,/ocean/projects/asc170022p/mtragoza/lung-proje...,,v033.ib.bridges2.psc.edu\n/ocean/projects/asc1...,
22919054,2,poisson,128,128,256,train1d_poisson_128_128_256,GPU-shared,,,,/ocean/projects/asc170022p/mtragoza/lung-proje...,,v031.ib.bridges2.psc.edu\n/ocean/projects/asc1...,
22919055,3,poisson,128,256,64,train1d_poisson_128_256_64,GPU-shared,,,,/ocean/projects/asc170022p/mtragoza/lung-proje...,,v031.ib.bridges2.psc.edu\n/ocean/projects/asc1...,
22919056,4,poisson,128,256,128,train1d_poisson_128_256_128,GPU-shared,,,,/ocean/projects/asc170022p/mtragoza/lung-proje...,,v031.ib.bridges2.psc.edu\n/ocean/projects/asc1...,
22919057,5,poisson,128,256,256,train1d_poisson_128_256_256,GPU-shared,,,,/ocean/projects/asc170022p/mtragoza/lung-proje...,,v031.ib.bridges2.psc.edu\n/ocean/projects/asc1...,
22919058,6,poisson,128,512,64,train1d_poisson_128_512_64,GPU-shared,,,,/ocean/projects/asc170022p/mtragoza/lung-proje...,,v031.ib.bridges2.psc.edu\n/ocean/projects/asc1...,
22919059,7,poisson,128,512,128,train1d_poisson_128_512_128,GPU-shared,,,,/ocean/projects/asc170022p/mtragoza/lung-proje...,,v031.ib.bridges2.psc.edu\n/ocean/projects/asc1...,
22919060,8,poisson,128,512,256,train1d_poisson_128_512_256,GPU-shared,,,,/ocean/projects/asc170022p/mtragoza/lung-proje...,,v031.ib.bridges2.psc.edu\n/ocean/projects/asc1...,
22919061,9,poisson,128,1024,64,train1d_poisson_128_1024_64,GPU-shared,,,,/ocean/projects/asc170022p/mtragoza/lung-proje...,,v033.ib.bridges2.psc.edu\n/ocean/projects/asc1...,


In [12]:
# Replace missing values with placeholders before grouping: a missing job
# state is labelled 'DONE' and a missing stderr 'N/A'.
status = status.assign(
    job_state=status['job_state'].fillna('DONE'),
    stderr=status['stderr'].fillna('N/A'),
)

# Count the jobs in each (state, image size, node count, stderr) group.
(
    status
    .groupby(['job_state', 'image_size', 'n_nodes', 'stderr'])
    [['job_name']]
    .count()
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,job_name
job_state,image_size,n_nodes,stderr,Unnamed: 4_level_1
DONE,128,128,,3
DONE,128,256,,3
DONE,128,512,,3
DONE,128,1024,,3
DONE,256,128,,3
DONE,256,256,,3
DONE,256,512,,3
DONE,256,1024,,3
DONE,512,128,,3
DONE,512,256,,3


In [13]:
# Print (rather than display) the first job's stderr so that any newlines in
# the text are rendered.
print(status['stderr'].iloc[0])




## Analyze results

[[Setup](#Setup-experiment)] [[Submit](#Submit-jobs)] [[Monitor](#Monitor-jobs)] [[Analyze](#Analyze-results)]

In [14]:
# Gather the logged training metrics for the jobs in `jobs` into one frame; the
# result carries the job columns plus epoch, phase, losses and timings.
# NOTE(review): sep='\t' presumably matches tab-delimited metrics files -- confirm.
m = ps.metrics(jobs, sep='\t')
m

Unnamed: 0,pde_name,image_size,n_nodes,batch_size,job_name,job_id,partition,job_state,node_id,runtime,work_dir,array_idx,epoch,phase,u_loss,mu_loss,t_model,t_loss,t_grad,t_optim
0,poisson,128,128,64,train1d_poisson_128_128_64,22919052,GPU-shared,PENDING,(None),0:00,/ocean/projects/asc170022p/mtragoza/lung-proje...,,0.000000,train,0.009514,1.208242,7.526374,0.623156,7.130759,1.948564
1,poisson,128,128,64,train1d_poisson_128_128_64,22919052,GPU-shared,PENDING,(None),0:00,/ocean/projects/asc170022p/mtragoza/lung-proje...,,0.007092,train,0.011694,1.627577,0.201888,0.033427,0.848133,0.000495
2,poisson,128,128,64,train1d_poisson_128_128_64,22919052,GPU-shared,PENDING,(None),0:00,/ocean/projects/asc170022p/mtragoza/lung-proje...,,0.014184,train,0.010370,1.337310,0.203189,0.000417,0.623567,0.000546
3,poisson,128,128,64,train1d_poisson_128_128_64,22919052,GPU-shared,PENDING,(None),0:00,/ocean/projects/asc170022p/mtragoza/lung-proje...,,0.021277,train,0.011133,1.449446,0.206238,0.000591,0.628820,0.000593
4,poisson,128,128,64,train1d_poisson_128_128_64,22919052,GPU-shared,PENDING,(None),0:00,/ocean/projects/asc170022p/mtragoza/lung-proje...,,0.028369,train,0.013075,1.802813,0.201915,0.000424,0.622286,0.000562
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
409794,poisson,1024,1024,256,train1d_poisson_1024_1024_256,22919099,GPU-shared,PENDING,(None),0:00,/ocean/projects/asc170022p/mtragoza/lung-proje...,,99.972222,train,0.001954,0.467625,0.181389,0.001673,0.441703,0.000486
409795,poisson,1024,1024,256,train1d_poisson_1024_1024_256,22919099,GPU-shared,PENDING,(None),0:00,/ocean/projects/asc170022p/mtragoza/lung-proje...,,100.000000,test,0.001476,0.310312,1.133690,0.000462,,
409796,poisson,1024,1024,256,train1d_poisson_1024_1024_256,22919099,GPU-shared,PENDING,(None),0:00,/ocean/projects/asc170022p/mtragoza/lung-proje...,,100.000000,test,0.001521,0.302251,1.146793,0.000165,,
409797,poisson,1024,1024,256,train1d_poisson_1024_1024_256,22919099,GPU-shared,PENDING,(None),0:00,/ocean/projects/asc170022p/mtragoza/lung-proje...,,100.000000,test,0.001496,0.310346,1.136603,0.000171,,


In [26]:
# Highest epoch logged by each job (one row per hyperparameter configuration).
d = (
    m
    .groupby(['pde_name', 'image_size', 'n_nodes', 'batch_size', 'job_name'])
    [['epoch']]
    .max()
)
d

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,epoch
pde_name,image_size,n_nodes,batch_size,job_name,Unnamed: 5_level_1
poisson,128,128,64,train1d_poisson_128_128_64,100.0
poisson,128,128,128,train1d_poisson_128_128_128,100.0
poisson,128,128,256,train1d_poisson_128_128_256,100.0
poisson,128,256,64,train1d_poisson_128_256_64,100.0
poisson,128,256,128,train1d_poisson_128_256_128,100.0
poisson,128,256,256,train1d_poisson_128_256_256,100.0
poisson,128,512,64,train1d_poisson_128_512_64,100.0
poisson,128,512,128,train1d_poisson_128_512_128,100.0
poisson,128,512,256,train1d_poisson_128_512_256,100.0
poisson,128,1024,64,train1d_poisson_128_1024_64,100.0


In [30]:
# Jobs whose highest logged epoch is below 100, i.e. that have not reached the
# epoch count the completed jobs show.
unfinished_jobs = d.loc[d['epoch'] < 100]
unfinished_jobs

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,epoch
pde_name,image_size,n_nodes,batch_size,job_name,Unnamed: 5_level_1
poisson,256,128,64,train1d_poisson_256_128_64,83.0
poisson,256,128,128,train1d_poisson_256_128_128,80.0
poisson,256,128,256,train1d_poisson_256_128_256,91.0
poisson,256,256,64,train1d_poisson_256_256_64,53.0
poisson,256,256,128,train1d_poisson_256_256_128,57.0
poisson,256,256,256,train1d_poisson_256_256_256,35.0
poisson,256,512,64,train1d_poisson_256_512_64,12.0


In [41]:
%autoreload
fig = ps.plot(
    m[(m.phase == 'train') & (m.epoch > 99)],
    x=['image_size', 'n_nodes', 'batch_size'],
    y=['u_loss', 'mu_loss'],
    legend=False,
    legend_kws=dict(bbox_to_anchor=(0, -0.2)),
    tight=True
)

<IPython.core.display.Javascript object>

In [51]:
%autoreload
fig = ps.plot(
    m[(m.phase == 'train') & (m.epoch > 99) & (m.batch_size == 64)],
    x=['image_size', 'n_nodes'],
    y=['u_loss', 'mu_loss'],
    legend=True,
    legend_kws=dict(bbox_to_anchor=(0, -0.2)),
    tight=True
)

<IPython.core.display.Javascript object>

In [45]:
%autoreload
m['nodes_per_pixel'] = m['n_nodes'] / m['image_size']
fig = ps.plot(
    m[(m.phase == 'train') & (m.epoch > 90)],
    x=['nodes_per_pixel', 'batch_size'],
    y=['u_loss', 'mu_loss'],
    legend=True,
    legend_kws=dict(bbox_to_anchor=(0, -0.2)),
    tight=True
)

<IPython.core.display.Javascript object>

In [48]:
# Mean training losses over epochs after 90, one row per configuration.
(
    m.loc[(m['phase'] == 'train') & (m['epoch'] > 90)]
    .groupby(['n_nodes', 'image_size', 'nodes_per_pixel', 'batch_size'])
    [['u_loss', 'mu_loss']]
    .mean()
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,u_loss,mu_loss
n_nodes,image_size,nodes_per_pixel,batch_size,Unnamed: 4_level_1,Unnamed: 5_level_1
128,128,1.0,64,0.000369,0.134447
128,128,1.0,128,0.000366,0.192505
128,128,1.0,256,0.00142,0.361678
128,256,0.5,256,0.001322,0.319769
128,512,0.25,64,0.000406,0.2508
128,512,0.25,128,0.000418,0.212024
128,512,0.25,256,0.000796,0.321007
128,1024,0.125,64,0.001075,0.29099
128,1024,0.125,128,0.00309,0.597349
128,1024,0.125,256,0.003971,0.635622


In [52]:
# Inspect the columns currently present on the metrics frame.
# NOTE(review): the recorded output lists 'nodes_x_pixels' and 'nodes = pixels',
# which no cell visible in this notebook creates -- likely leftover kernel
# state from removed cells; confirm before relying on them.
m.columns

Index(['pde_name', 'image_size', 'n_nodes', 'batch_size', 'job_name', 'job_id',
       'partition', 'job_state', 'node_id', 'runtime', 'work_dir', 'array_idx',
       'epoch', 'phase', 'u_loss', 'mu_loss', 't_model', 't_loss', 't_grad',
       't_optim', 'nodes_per_pixel', 'nodes_x_pixels', 'nodes = pixels'],
      dtype='object')

In [58]:
%autoreload
fig = ps.plot(
    m[(m.phase == 'train')],
    x=['n_nodes', 'batch_size'],
    y=['t_model', 't_loss', 't_grad', 't_optim'],
    legend=True,
    legend_kws=dict(bbox_to_anchor=(0, -0.2)),
    tight=True
)

<IPython.core.display.Javascript object>