# BASELINE PIPELINE
> Dev log (format < Date > | <Author(s)> )  
> - Created: 28 July 2023 | JiHoon Kim <br>

In [1]:
# render matplotlib figures inline in the notebook output
%matplotlib inline
# autoreload (mode 2): re-import edited modules before each cell is executed
%load_ext autoreload
%autoreload 2

In [2]:
# check GPUs available and memory (shell escape; needs the `gpustat` command on PATH)
! gpustat

[1m[37mcuda02                    [m  Mon Jul 31 08:56:07 2023  [1m[30m470.199.02[m
[36m[0][m [34mNVIDIA GeForce RTX 3090[m |[31m 30'C[m, [32m  0 %[m | [36m[1m[33m14871[m / [33m24268[m MB | [1m[30mtomasz[m([33m813M[m) [1m[30mtomasz[m([33m2133M[m) [1m[30mtomasz[m([33m3569M[m) [1m[30mtomasz[m([33m895M[m) [1m[30mtomasz[m([33m2249M[m) [1m[30mtomasz[m([33m1219M[m) [1m[30mtomasz[m([33m1057M[m) [1m[30mtomasz[m([33m1869M[m) [1m[30mtomasz[m([33m1057M[m) [1m[30mgdm[m([33m4M[m)
[36m[1][m [34mNVIDIA GeForce RTX 3090[m |[1m[31m 62'C[m, [1m[32m 65 %[m | [36m[1m[33m18440[m / [33m24268[m MB | [1m[30mmanuel[m([33m1343M[m) [1m[30mjihoon[m([33m2443M[m) [1m[30mjihoon[m([33m2443M[m) [1m[30mjihoon[m([33m2443M[m) [1m[30mjihoon[m([33m2443M[m) [1m[30mjihoon[m([33m2443M[m) [1m[30mjihoon[m([33m2425M[m) [1m[30mjihoon[m([33m2445M[m) [1m[30mgdm[m([33m4M[m)
[36m[2][m [34mNVIDIA GeForce

## import libraries

In [3]:
# third-party (used by the visualization cells at the end of the notebook)
import matplotlib.pyplot as plt
import pandas as pd

# add custom imports
from create_toybrains import ToyBrainsData
from utils.pipeline import run_baseline_pipeline

## function

In [4]:
# create toybrains dataset in loop
def generate_toybrains_list(data_dict, img=True, debug=False):
    ''' collect the toybrains csv paths, optionally generating each dataset first

    PARAMETER
    ---------
    data_dict : dictionary
        maps a dataset directory name to a dict of settings;
        the keys 'n_samples' and 'config' are read, any other key is ignored here

    img : boolean, default : True
        if True, run `create_toybrains.py` once per entry to generate the dataset,
        if False, only assemble the csv paths

    debug : boolean, default : False
        debug mode, print a progress message per dataset

    OUTPUT
    ------
    path_list : list
        list of csv path, element is a string of the form
        'dataset/<data_dir>/toybrains_n<n_samples>.csv'

    RAISES
    ------
    KeyError
        if an entry has no 'n_samples' or 'config' key
    subprocess.CalledProcessError
        if `create_toybrains.py` exits with a non-zero status (only when img=True)
    '''
    # Local imports keep this cell self-contained; plain subprocess (instead of the
    # IPython `!` escape) makes the function usable from a script as well as a notebook.
    import subprocess
    import sys

    path_list = []

    for data_dir, args in data_dict.items():
        n_samples, config = args['n_samples'], args['config']

        if img:
            if debug: print(f"Save toybrains dataset (N={n_samples}) in 'dataset/{data_dir}'")
            # Argument list (no shell): values containing spaces or shell
            # metacharacters are passed through verbatim.
            # sys.executable -> same interpreter/environment as the running kernel.
            cmd = [sys.executable, 'create_toybrains.py',
                   '--dir', str(data_dir), '-n', str(n_samples)]
            # Only forward a config that was actually given; `-c None` would hand the
            # literal string "None" to the CLI.
            # NOTE(review): assumes the CLI falls back to its default config without -c — confirm.
            if config is not None:
                cmd += ['-c', str(config)]
            # check=True: fail loudly instead of returning a path to a csv that was never written.
            subprocess.run(cmd, check=True)

        csv_path = f"dataset/{data_dir}/toybrains_n{n_samples}.csv"
        path_list.append(csv_path)
        if debug: print(f'summary can be found in {csv_path}\n')

    return path_list

## check your config before generating the toybrains dataset

In [None]:
# check the config before generating toybrains
# (original note: it uses the shapes directory — TODO confirm what that refers to)
ToyBrainsData().show_current_config()
# ToyBrainsData(config='demo').show_current_config()

## generate toybrains dataset

`bsp1. run manually`

In [None]:
# ! python create_toybrains.py -d --dir 'toybrains30k' -n 30000 -c 'demo'

`bsp2. generate_toybrains_list`

It should be extended as baseline config dict.

In [None]:
# dataset directory name -> settings for that dataset.
# generate_toybrains_list reads 'n_samples' and 'config'; it does not read 'debug'.
data_dict = {
    'toybrains1k':  {'n_samples': 1000,  'config': None},
    'toybrains3k':  {'n_samples': 3000,  'config': None},
    'toybrains10k': {'n_samples': 10000, 'config': None, 'debug': True},
    'toybrains30k': {'n_samples': 30000, 'config': None, 'debug': True},
}

In [None]:
# RUN with generate toybrains only for img=True
# csv_path_list = generate_toybrains_list(data_dict, img=True, debug=True)

In [None]:
# img=False: skip dataset generation and only collect the csv paths
csv_path_list = generate_toybrains_list(data_dict, img=False, debug=False)

## run baseline pipeline

![Diagram](https://github.com/RoshanRane/toybrains/assets/39021807/50ea3447-536f-4977-803e-7ff7668a48f1)

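Pass the input file list. The cells below are alternatives — each one overwrites `csv_path_list` and `data_dict`, so run only the one you need.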

`bsp1. use data_dict`

In [5]:
# single 1k-sample dataset; img=False only assembles its csv path
data_dict = {'toybrains1k': {'n_samples': 1000, 'config': None}}
csv_path_list = generate_toybrains_list(data_dict, debug=False, img=False)

`bsp2. use raw csv absolute path`

In [None]:
# NOTE(review): hard-coded, machine-specific absolute path — adjust to your environment
csv_path_list = ['/ritter/share/projects/JiHoon/toybrains/toybrains30k/toybrains_n30000.csv']
# no generation settings when pointing at an existing csv directly
data_dict = None

`bsp3. use raw csv relative path`

In [None]:
# path relative to the working directory
# NOTE(review): generate_toybrains_list builds paths under 'dataset/'; confirm this one still resolves
csv_path_list = ['toybrains30k/toybrains_n30000.csv']
# no generation settings when pointing at an existing csv directly
data_dict = None

In [None]:
# show which csv files will be passed to the baseline pipeline
print(csv_path_list)

run baseline

In [None]:
# data_dict is passed as the second argument to run_baseline_pipeline below;
# NOTE(review): running this cell discards a data_dict set by the cells above — confirm that is intended
data_dict = None

In [6]:
# (TODO) parallel needed
# run the baseline pipeline on one csv file after another
for csv_path in csv_path_list:
    run_baseline_pipeline(csv_path, data_dict, debug=False)

time: 2023-07-31 08:56:40.467148
Running Baseline on a file: toybrains1k/toybrains_n1000.csv



Global seed set to 42


run supervised learning
 Training data split = 800 
 Validation data split = 100 
 Test data split = 100


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
You are using a CUDA device ('NVIDIA GeForce RTX 3090') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
Missing logger folder: /ritter/share/projects/JiHoon/toybrains/results/toybrains1k/20230731-0856/supervised_logs/lblbin_shp/MLP
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]

  | Name      | Type           | Params
---------------------------------------------
0 | model     | PyTorchMLP     | 615 K 
1 | train_acc | BinaryAccuracy | 0     
2 | val_acc   | BinaryAccuracy | 0     
3 | test_acc  | BinaryAccuracy | 0     
---------------------------------------------
615 K     Train

You are using a CUDA device ('NVIDIA GeForce RTX 3090') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
Restoring states from the checkpoint path at /ritter/share/projects/JiHoon/toybrains/results/toybrains1k/20230731-0856/supervised_logs/lblbin_shp/MLP/version_0/checkpoints/epoch=9-step=500.ckpt
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]
Loaded model weights from checkpoint at /ritter/share/projects/JiHoon/toybrains/results/toybrains1k/20230731-0856/supervised_logs/lblbin_shp/MLP/version_0/checkpoints/epoch=9-step=500.ckpt


You are using a CUDA device ('NVIDIA GeForce RTX 3090') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
Restoring states from the checkpoint path at /ritter/share/projects/JiHoon/toybrains/results/toybrains1k/20230731-0856/supervised_logs/lblbin_shp/MLP/version_0/checkpoints/epoch=9-step=500.ckpt
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]
Loaded model weights from checkpoint at /ritter/share/projects/JiHoon/toybrains/results/toybrains1k/20230731-0856/supervised_logs/lblbin_shp/MLP/version_0/checkpoints/epoch=9-step=500.ckpt


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
You are using a CUDA device ('NVIDIA GeForce RTX 3090') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
Missing logger folder: /ritter/share/projects/JiHoon/toybrains/results/toybrains1k/20230731-0856/supervised_logs/lblbin_shp-vol/MLP
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]

  | Name      | Type           | Params
---------------------------------------------
0 | model     | PyTorchMLP     | 615 K 
1 | train_acc | BinaryAccuracy | 0     
2 | val_acc   | BinaryAccuracy | 0     
3 | test_acc  | BinaryAccuracy | 0     
---------------------------------------------
615 K     T

 Training data split = 800 
 Validation data split = 100 
 Test data split = 100


Your `val_dataloader`'s sampler has shuffling enabled, it is strongly recommended that you turn shuffling off for val/test/predict dataloaders.
The dataloader, val_dataloader 0, does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` (try 40 which is the number of cpus on this machine) in the `DataLoader` init to improve performance.
The dataloader, train_dataloader, does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` (try 40 which is the number of cpus on this machine) in the `DataLoader` init to improve performance.
`Trainer.fit` stopped: `max_epochs=10` reached.
You are using a CUDA device ('NVIDIA GeForce RTX 3090') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32

You are using a CUDA device ('NVIDIA GeForce RTX 3090') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
Restoring states from the checkpoint path at /ritter/share/projects/JiHoon/toybrains/results/toybrains1k/20230731-0856/supervised_logs/lblbin_shp-vol/MLP/version_0/checkpoints/epoch=9-step=500.ckpt
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]
Loaded model weights from checkpoint at /ritter/share/projects/JiHoon/toybrains/results/toybrains1k/20230731-0856/supervised_logs/lblbin_shp-vol/MLP/version_0/checkpoints/epoch=9-step=500.ckpt


You are using a CUDA device ('NVIDIA GeForce RTX 3090') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
Restoring states from the checkpoint path at /ritter/share/projects/JiHoon/toybrains/results/toybrains1k/20230731-0856/supervised_logs/lblbin_shp-vol/MLP/version_0/checkpoints/epoch=9-step=500.ckpt
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]
Loaded model weights from checkpoint at /ritter/share/projects/JiHoon/toybrains/results/toybrains1k/20230731-0856/supervised_logs/lblbin_shp-vol/MLP/version_0/checkpoints/epoch=9-step=500.ckpt


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
You are using a CUDA device ('NVIDIA GeForce RTX 3090') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
Missing logger folder: /ritter/share/projects/JiHoon/toybrains/results/toybrains1k/20230731-0856/supervised_logs/lblbin_shp-vent/MLP
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]

  | Name      | Type           | Params
---------------------------------------------
0 | model     | PyTorchMLP     | 615 K 
1 | train_acc | BinaryAccuracy | 0     
2 | val_acc   | BinaryAccuracy | 0     
3 | test_acc  | BinaryAccuracy | 0     
---------------------------------------------
615 K     

 Training data split = 800 
 Validation data split = 100 
 Test data split = 100


Your `val_dataloader`'s sampler has shuffling enabled, it is strongly recommended that you turn shuffling off for val/test/predict dataloaders.
The dataloader, val_dataloader 0, does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` (try 40 which is the number of cpus on this machine) in the `DataLoader` init to improve performance.
The dataloader, train_dataloader, does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` (try 40 which is the number of cpus on this machine) in the `DataLoader` init to improve performance.
`Trainer.fit` stopped: `max_epochs=10` reached.
You are using a CUDA device ('NVIDIA GeForce RTX 3090') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32

You are using a CUDA device ('NVIDIA GeForce RTX 3090') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
Restoring states from the checkpoint path at /ritter/share/projects/JiHoon/toybrains/results/toybrains1k/20230731-0856/supervised_logs/lblbin_shp-vent/MLP/version_0/checkpoints/epoch=9-step=500.ckpt
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]
Loaded model weights from checkpoint at /ritter/share/projects/JiHoon/toybrains/results/toybrains1k/20230731-0856/supervised_logs/lblbin_shp-vent/MLP/version_0/checkpoints/epoch=9-step=500.ckpt


You are using a CUDA device ('NVIDIA GeForce RTX 3090') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
Restoring states from the checkpoint path at /ritter/share/projects/JiHoon/toybrains/results/toybrains1k/20230731-0856/supervised_logs/lblbin_shp-vent/MLP/version_0/checkpoints/epoch=9-step=500.ckpt
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]
Loaded model weights from checkpoint at /ritter/share/projects/JiHoon/toybrains/results/toybrains1k/20230731-0856/supervised_logs/lblbin_shp-vent/MLP/version_0/checkpoints/epoch=9-step=500.ckpt


 Training data split = 800 
 Validation data split = 100 
 Test data split = 100


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
You are using a CUDA device ('NVIDIA GeForce RTX 3090') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
Missing logger folder: /ritter/share/projects/JiHoon/toybrains/results/toybrains1k/20230731-0856/supervised_logs/cov_sex/MLP
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]

  | Name      | Type           | Params
---------------------------------------------
0 | model     | PyTorchMLP     | 615 K 
1 | train_acc | BinaryAccuracy | 0     
2 | val_acc   | BinaryAccuracy | 0     
3 | test_acc  | BinaryAccuracy | 0     
---------------------------------------------
615 K     Trainabl

You are using a CUDA device ('NVIDIA GeForce RTX 3090') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
Restoring states from the checkpoint path at /ritter/share/projects/JiHoon/toybrains/results/toybrains1k/20230731-0856/supervised_logs/cov_sex/MLP/version_0/checkpoints/epoch=9-step=500.ckpt
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]
Loaded model weights from checkpoint at /ritter/share/projects/JiHoon/toybrains/results/toybrains1k/20230731-0856/supervised_logs/cov_sex/MLP/version_0/checkpoints/epoch=9-step=500.ckpt


You are using a CUDA device ('NVIDIA GeForce RTX 3090') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
Restoring states from the checkpoint path at /ritter/share/projects/JiHoon/toybrains/results/toybrains1k/20230731-0856/supervised_logs/cov_sex/MLP/version_0/checkpoints/epoch=9-step=500.ckpt
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]
Loaded model weights from checkpoint at /ritter/share/projects/JiHoon/toybrains/results/toybrains1k/20230731-0856/supervised_logs/cov_sex/MLP/version_0/checkpoints/epoch=9-step=500.ckpt


running a total of 2 different settings of models
TOTAL RUNTIME: 0:05:35


visualization

In [None]:
# (TODO) visualization

In [None]:
    # NOTE(review): this cell looks like a function body pasted without its `def` line;
    # n_component_list, dataset, method, seed, label, get_reduc_loader and
    # run_logistic_regression are not defined in the visible cells — TODO restore them.
    # Collects three accuracy values per n_components (presumably train/val/test) and plots them.
    n_list, tr_acc, vl_acc, te_acc = [], [], [], []

    for n_components in n_component_list:
        data = get_reduc_loader(dataset = dataset, method = method, n_components=n_components, seed = seed)
        
        # run logistic regression
    
        print(f"N = {n_components}")
        # the second return value of run_logistic_regression is not used
        acc, _ = run_logistic_regression(data)
        n_list.append(n_components)
        tr_acc.append(acc[0])
        vl_acc.append(acc[1])
        te_acc.append(acc[2])
        
    # one line per accuracy list, all over the same n_components axis
    plt.plot(n_list, tr_acc)
    plt.plot(n_list, vl_acc)
    plt.plot(n_list, te_acc)
    
    plt.title(f"Accuracy with n on {label}")
    plt.xlabel('N component')
    plt.ylabel('Accuracy')
    
    plt.show()

In [None]:
# Average the logged metrics per epoch and plot the loss / accuracy curves.
# NOTE(review): the run directory is hard-coded; point it at the metrics.csv to inspect.
metrics = pd.read_csv("logs/my_model/version_5/metrics.csv")

agg_col = "epoch"
# One row per epoch: mean of every logged column within that epoch.
# mean() skips NaN, so columns that are only filled on some rows of an epoch still aggregate.
df_metrics = metrics.groupby(agg_col, as_index=False).mean()

df_metrics[["train_loss", "val_loss"]].plot(
    grid=True, legend=True, xlabel="Epoch", ylabel="Loss"
)
df_metrics[["train_acc", "val_acc"]].plot(
    grid=True, legend=True, xlabel="Epoch", ylabel="ACC"
)

plt.show()