## load libraries

In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [2]:
# standard python packages
import os, sys, shutil
from glob import glob
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import random

In [3]:
sys.path.insert(0, "../")
from utils.DLutils import *
from utils.vizutils import plot_col_dists
from create_toybrains import ToyBrainsData
from experiments.fit_DL_model import *

In [4]:
DEEPREPVIZ_REPO = "../../Deep-confound-control-v2/"
sys.path.append(DEEPREPVIZ_REPO)
from DeepRepViz import *

In [5]:
from lightning.pytorch.loggers import TensorBoardLogger, CSVLogger, WandbLogger

In [6]:
import logging

# Silence some unnecessary Lightning chatter: for these loggers only
# WARNING-and-above records are emitted.
for _noisy_logger in ("lightning.pytorch.utilities.rank_zero",
                      "lightning.pytorch.accelerators.cuda"):
    logging.getLogger(_noisy_logger).setLevel(logging.WARNING)

## Generate toybrain datasets 
```bash
$ nohup python3 create_toybrains.py -c configs.lbl1cov1 -n 10000 &> nohup1.out &
$ nohup python3 create_toybrains.py -c configs.lbl1cov1_midsignal -n 10000 --suffix n_midsignal &>  nohup2.out &
$ nohup python3 create_toybrains.py -c configs.lbl1cov1_lowsignal -n 10000 --suffix n_lowsignal &> nohup3.out &
```

## Fit DL models

In [7]:
# Folders of the generated ToyBrains datasets (one per create_toybrains.py run):
# the default one first, followed by the mid-signal and low-signal variants.
DATASETS = [f"../dataset/toybrains_n10000{suffix}"
            for suffix in ("", "_midsignal", "_lowsignal")]

### Generative attr. dist.

In [8]:
# for data_dir in DATASETS:
#     data_name = data_dir.split('/')[-1]
#     df = pd.read_csv(f'{data_dir}/{data_name}.csv')
#     cov_cols = df.filter(regex='^(cov_|lbl_)').columns
#     attr_cols = df.filter(regex='^(?!(cov_|lbl_)).+').columns
#     plot_col_dists(df, 
#                    attr_cols=attr_cols, cov_cols=cov_cols, 
#                    title=f"{data_name}: Dist. of generative data attributes vs the labels")
#     plt.show()

### Baseline attr. accuracies

In [9]:
# results = []
# for data_dir in DATASETS:
#     toy = ToyBrainsData("configs.lbl1cov1")
#     # load the already generated dataset
#     toy.load_generated_dataset(data_dir)
#     result = toy.fit_baseline_models(CV=10) 
#     results.append(result)

In [10]:
# toy.viz_baseline_results(results)

##### Set the GPU

In [11]:
# check GPUs available and memory
# ! gpustat

In [12]:
# Indices of the GPU device(s) to train on; intended for the Lightning trainer's
# `devices` argument (see the commented-out fit_DL_model calls further below).
GPUs = [1]

In [13]:
# Make CUDA kernel launches synchronous so that GPU errors surface at the
# offending call (a debugging aid; it slows training down).
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
# Trade some float32 matmul precision for speed on GPUs that support it.
torch.set_float32_matmul_precision("medium")

### fit_DL_model

In [14]:
# --- experiment configuration ---
label = "lbl_lesion"                # column of the dataset table used as the prediction target
model_class = SimpleCNN             # architecture to instantiate
num_classes = 1                     # passed to both the model and the LightningModel wrapper
logger_args = {"save_dir": "log"}   # extra keyword arguments for the Lightning logger
batch_size = 128
random_seed = 42                    # used for the train/val/test split
# debug mode: overfit on a few batches, no early stopping, no training curves
debug = True

Inspect the layer names in the model:

In [15]:
# throwaway instance of the configured architecture, used only to inspect its layers
model = model_class(num_classes=num_classes)

In [16]:
# display the (name, module) pairs of the model's layers as the cell output
get_all_model_layers(model)

[('0', Conv2d(3, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))),
 ('0', Conv2d(16, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))),
 ('0', Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))),
 ('1', Linear(in_features=4096, out_features=3, bias=True)),
 ('2', Linear(in_features=3, out_features=1, bias=False))]

### setup DeepRepViz

In [17]:
# Pick the dataset to analyse and load its attribute/label table.
dataset_path = DATASETS[0]
# The last "_"-separated token of the folder name identifies the dataset variant
# (e.g. "n10000" or "midsignal"). normpath() makes this robust to a trailing "/".
unique_name = os.path.basename(os.path.normpath(dataset_path)).split('_')[-1]
# sorted() makes the choice deterministic if several CSV files match the pattern
csv_matches = sorted(glob(f'{dataset_path}/*{unique_name}.csv'))
if not csv_matches:
    raise FileNotFoundError(
        f"No '*{unique_name}.csv' file found in '{dataset_path}'. "
        "Generate the dataset first with create_toybrains.py.")
raw_csv_path = csv_matches[0]
df_data = pd.read_csv(raw_csv_path)

In [18]:
# Split the dataset table into train / validation / test frames.
# `label` and `random_seed` are forwarded to the project helper split_dataset();
# presumably the split is seeded (and possibly stratified on the label) -- TODO confirm.
df_train, df_val, df_test = split_dataset(df_data, label, random_seed)

# report the size of each split
print(f"Dataset: {dataset_path} ({unique_name})\n  Training data split = {len(df_train)} \n \
 Validation data split = {len(df_val)} \n  Test data split = {len(df_test)}")

# Generate the data loaders: all three read from the same image folder with the same batch size.
common_settings = dict(images_dir=dataset_path+'/images',
                       batch_size=batch_size)

# the training loader keeps the helper's default `shuffle` setting
train_loader = get_toybrain_dataloader(
                df_train,
                **common_settings)
# validation and test loaders explicitly disable shuffling
val_loader = get_toybrain_dataloader(
                df_val, shuffle=False,
                **common_settings)
test_loader = get_toybrain_dataloader(
                df_test, shuffle=False,
                **common_settings)

Dataset: ../dataset/toybrains_n10000 (n10000)
  Training data split = 7809 
  Validation data split = 191 
  Test data split = 2000


In [19]:
# Prepare the settings for one more dataloader that covers the whole dataset.
# First record which split every row belongs to, then merge the splits back
# into a single table.
split_colname = 'datasplit'
ID_col = 'subjectID'
for split_df, split_name in ((df_train, 'train'),
                             (df_val, 'val'),
                             (df_test, 'test')):
    split_df[split_colname] = split_name
df_data = pd.concat([df_train, df_val, df_test])

# keyword arguments handed to DeepRepViz for constructing its dataloader
drv_loader_kwargs = {
    "img_dir": dataset_path+'/images',
    "img_names": df_data[ID_col].values,
    "labels": df_data[label].values,
    "transform": transforms.ToTensor(),
}

In [20]:
# Keyword arguments forwarded to L.Trainer.
trainer_args = {"max_epochs": 50,
                "accelerator": 'gpu',
                # reuse the GPU selection from the "Set the GPU" cell instead of
                # hard-coding the device index a second time
                "devices": GPUs}
# number of epochs without val_loss improvement before stopping (falsy disables early stopping)
early_stop_patience = 6
# training curves are only plotted for full (non-debug) runs
show_training_curves = not debug

In [21]:
# Seed the python / numpy / torch RNGs so that weight initialisation and batch
# shuffling are repeatable across runs of this cell.
L.seed_everything(random_seed, workers=True)

# DeepRepViz is attached to the trainer as a callback; it gets the full dataset
# table plus the settings needed to build its own dataloader.
drv = DeepRepViz(conf_table=df_data,
                 ID_col=ID_col, label_col=label, split_col=split_colname,
                 dataloader_class=ToyBrainsDataloader,
                 dataloader_kwargs=drv_loader_kwargs,
                 hook_layer=-1,
                 debug=False)

# fresh model instance so that re-running the cell restarts training from scratch
model = model_class(num_classes=num_classes)

callbacks = [drv]
# early stopping is skipped in debug mode
if early_stop_patience and not debug:
    callbacks.append(EarlyStopping(monitor="val_loss", mode="min",
                                   patience=early_stop_patience))

lightning_model = LightningModel(model, learning_rate=0.05,
                                 num_classes=num_classes)
# configure trainer settings
logger = TensorBoardLogger(version=unique_name, **logger_args)
# callbacks.append(ModelCheckpoint(dirpath=logger.log_dir,
#                          monitor="val_loss", mode="min",
#                          save_top_k=1, save_last=True))
# delete previous logs (best effort: a failure is reported but does not stop the run)
if os.path.isdir(logger.log_dir):
    try:
        shutil.rmtree(logger.log_dir)
    except OSError as err:
        print(f"Warning: could not delete previous logs in '{logger.log_dir}': {err}")
# train model
trainer = L.Trainer(callbacks=callbacks,
                    logger=logger,
                    # fast_dev_run= 50 if debug else False,
                    overfit_batches=5 if debug else 0,
                    log_every_n_steps=2 if debug else 50,
                    **trainer_args) # deterministic=True
trainer.fit(
    model=lightning_model,
    train_dataloaders=train_loader,
    val_dataloaders=val_loader)


# show training curves
if show_training_curves and (logger) and not isinstance(logger, WandbLogger):
    metrics_csv = f"{logger.log_dir}/metrics.csv"
    if not os.path.isfile(metrics_csv):
        # metrics.csv is produced by a CSVLogger; other loggers (e.g. the
        # TensorBoardLogger used above) do not write it.
        print(f"No '{metrics_csv}' found: skipping the training curves "
              "(use a CSVLogger to get them).")
    else:
        metrics = pd.read_csv(metrics_csv)
        # average all logged values within each epoch
        df_metrics = metrics.groupby("epoch").mean().reset_index()

        f, axes = plt.subplots(1, 2, sharex=True,
                               constrained_layout=True,
                               figsize=(7, 3))
        df_metrics[["train_loss", "val_loss"]].plot(
            ylabel="Loss", ax=axes[0],
            grid=True, legend=True, xlabel="Epoch",
        )
        df_metrics[["train_D2", "val_D2"]].plot(
            ylabel=r"$D^2$", ax=axes[1],
            grid=True, legend=True, xlabel="Epoch", ylim=(0, 1)
        )
        plt.show()

# test model
# The model instance is passed directly and no checkpoint is loaded, so the
# scores below are those of the weights at the end of training.
test_scores = trainer.test(lightning_model, verbose=False,
                           dataloaders=test_loader,
                          )[0]

print("Test data performance with the final model:\n"
      "-------------------------------------------------------\n"
      "Dataset      = {} ({})\n"
      "Balanced Acc = {:.2f}% \t D2 = {:.2f}%".format(
          dataset_path, unique_name,
          test_scores['test_BAC']*100, test_scores['test_D2']*100))

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]

  | Name           | Type              | Params
-----------------------------------------------------
0 | model          | SimpleCNN         | 36.1 K
1 | _metric_spec   | BinarySpecificity | 0     
2 | _metric_recall | BinaryRecall      | 0     
3 | metric_D2      | D2metric          | 0     
-----------------------------------------------------
36.1 K    Trainable params
0         Non-trainable params
36.1 K    Total params
0.144     Total estimated model params size (MB)
2023-10-27 17:17:34.838073: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-10-27 17:17:34.977775: I tensorflow/core/util/port.cc:104] oneDNN custom operations are 

Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(


Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]


Validation: 0it [00:00, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]


Validation: 0it [00:00, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]


Validation: 0it [00:00, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]


Validation: 0it [00:00, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]


Validation: 0it [00:00, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]


Validation: 0it [00:00, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]


Validation: 0it [00:00, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]


Validation: 0it [00:00, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]


Validation: 0it [00:00, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]


Validation: 0it [00:00, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]


Validation: 0it [00:00, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]


Validation: 0it [00:00, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]


Validation: 0it [00:00, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]


Validation: 0it [00:00, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]


Validation: 0it [00:00, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]


Validation: 0it [00:00, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]


Validation: 0it [00:00, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]


Validation: 0it [00:00, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]


Validation: 0it [00:00, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]


Validation: 0it [00:00, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]


Validation: 0it [00:00, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]


Validation: 0it [00:00, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]


Validation: 0it [00:00, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]


Validation: 0it [00:00, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]


Validation: 0it [00:00, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]


Validation: 0it [00:00, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]


Validation: 0it [00:00, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]


Validation: 0it [00:00, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]


Validation: 0it [00:00, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]


Validation: 0it [00:00, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]


Validation: 0it [00:00, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]


Validation: 0it [00:00, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]


Validation: 0it [00:00, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]


Validation: 0it [00:00, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]


Validation: 0it [00:00, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]


Validation: 0it [00:00, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]


Validation: 0it [00:00, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]


Validation: 0it [00:00, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]


Validation: 0it [00:00, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]


Validation: 0it [00:00, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]


Validation: 0it [00:00, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]


Validation: 0it [00:00, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]


Validation: 0it [00:00, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]


Validation: 0it [00:00, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]


Validation: 0it [00:00, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]


Validation: 0it [00:00, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]


Validation: 0it [00:00, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]


Validation: 0it [00:00, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]


Validation: 0it [00:00, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]


Testing: 0it [00:00, ?it/s]

Test data performance with the best model:
-------------------------------------------------------
Dataset      = ../dataset/toybrains_n10000 (n10000)
Balanced Acc = 87.52% 	 D2 = 48.21%


In [22]:
# drv.checkpoints

In [32]:
import h5py as h5

print("Showing the contents of the last generated h5file log:")
# HDF5 logs found under the logger directory (presumably written by the
# DeepRepViz callback); sorted by file name so that [-1] picks the last one.
h5files = sorted(
    glob(logger.log_dir+'/deeprepvizlog/*.h5'))
if not h5files:
    raise FileNotFoundError(
        f"No .h5 log found in '{logger.log_dir}/deeprepvizlog/'. "
        "Run the training cell first.")
# The file handle gets its own name so that it does not shadow the `h5` module alias.
with h5.File(h5files[-1], 'r') as h5file:
    # one line per top-level dataset stored in the file
    for k in h5file.keys():
        print(f'key {k}:\t { h5file[k]}')
    # file-level attributes (the sample output shows BAC, D2 and loss)
    print(dict(h5file.attrs))

key IDs:	 <HDF5 dataset "IDs": shape (10000,), type "<i8">
key acts:	 <HDF5 dataset "acts": shape (10000, 3), type "<f4">
key labels:	 <HDF5 dataset "labels": shape (10000,), type "<f4">
key preds:	 <HDF5 dataset "preds": shape (10000,), type "<f4">
{'BAC': 0.8780689, 'D2': 0.4782772401988667, 'loss': 0.35959324}


In [24]:
# 'epoch', 'global_step', 'pytorch-lightning_version', 'state_dict', 
# 'loops', 'callbacks', 'optimizer_states', 'lr_schedulers'

In [25]:
# from pytorch_lightning.callbacks import Callback
 
# class LogFinalLayerWeights(Callback):
    
#     def on_validation_batch_end(
#         self, trainer, pl_module, outputs, batch, batch_idx, dataloader_idx):
#         if batch_idx == 0:
#             print("dataloader_idx", dataloader_idx)
#             wandb_logger = trainer.logger
#             # log gradients, parameter histogram and model topology
#             x, y = batch
#             # log predictions as a Table
#             columns = ['representations', 'label_true', 'label_predicted']
#             #`outputs` comes from `LightningModule.validation_step` which corresponds to our model predictions in this case
#             data = [[wandb.Image(x_i), y_i, y_pred] for x_i, y_i, y_pred in list(zip(x, y, outputs))]
#             wandb_logger.log_table(key='sample_table', columns=columns, data=data)

# callbacks.append(LogFinalLayerWeights())

In [26]:
# %reload_ext tensorboard
# %tensorboard --logdir=./log_tensorboard

In [27]:
# ! kill -9 $(lsof -t -i:6006)

In [28]:
# trainer, logger = fit_DL_model(
#                             DATASETS[1],
#                             label,
#                             model=model,
#                             debug=debug,
#                             logger=WandbLogger,
#                             logger_args=logger_args,
#                             batch_size=batch_size,
#                             trainer_args=dict(devices=GPUs, 
#                                               max_epochs=max_epochs))

In [29]:
# trainer, logger = fit_DL_model(
#                             DATASETS[2],
#                             label,
#                             model=model,
#                             debug=debug,
#                             logger=WandbLogger,
#                             logger_args=logger_args,
#                             batch_size=batch_size,
#                             trainer_args=dict(devices=GPUs, 
#                                               max_epochs=max_epochs))

In [30]:
## dir(Trainer)
# ['_Trainer__setup_profiler', '__class__', '__delattr__', 
# '__dict__', '__dir__', '__doc__', '__eq__', '__format__', 
# '__ge__', '__getattribute__', '__gt__', '__hash__', 
# '__init__', '__init_subclass__', '__le__', '__lt__', 
# '__module__', '__ne__', '__new__', '__reduce__', 
# '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', 
# '__str__', '__subclasshook__', '__weakref__', 
# '_accelerator_connector', '_active_loop', 
# '_callback_connector', '_checkpoint_connector', 
# '_data_connector', '_default_root_dir', '_detect_anomaly', 
# '_evaluation_loop', '_fit_impl', '_logger_connector', 
# '_loggers', '_predict_impl', '_results', '_run', 
# '_run_sanity_check', '_run_stage', '_signal_connector', 
# '_teardown', '_test_impl', '_validate_impl', 
# 'accelerator', 
# 'accumulate_grad_batches', 'barebones', 'callback_metrics', 
# 'callbacks', 'check_val_every_n_epoch', 'checkpoint_callback',
# 'checkpoint_callbacks', 'ckpt_path', 'current_epoch', 
# 'datamodule', 'default_root_dir', 'device_ids', 
# 'distributed_sampler_kwargs', 'early_stopping_callback', 
# 'early_stopping_callbacks', 'enable_validation', 
# 'estimated_stepping_batches', 'evaluating', 'fast_dev_run', 
# 'fit', 'fit_loop', 'global_rank', 'global_step', 
# 'gradient_clip_algorithm', 'gradient_clip_val',
# 'interrupted', 'is_global_zero', 'is_last_batch', 
# 'lightning_module', 'limit_predict_batches', 
# 'limit_test_batches', 'limit_train_batches', 
# 'limit_val_batches', 'local_rank', 'log_dir', 
# 'log_every_n_steps', 'logged_metrics', 'logger', 'loggers', 
# 'lr_scheduler_configs', 'max_epochs', 'max_steps', 'min_epochs', 
# 'min_steps', 'model', 'node_rank', 'num_devices', 'num_nodes',
# 'num_predict_batches', 'num_sanity_val_batches', 
# 'num_sanity_val_steps', 'num_test_batches', 
# 'num_training_batches', 'num_val_batches', 'optimizers', 
# 'overfit_batches', 'precision', 'precision_plugin', 
# 'predict', 'predict_dataloaders', 'predict_loop', 
# 'predicting', 'profiler', 'progress_bar_callback', 
# 'progress_bar_metrics', 'received_sigterm', 
# 'reload_dataloaders_every_n_epochs', 'sanity_checking', 
# 'save_checkpoint', 'scaler', 'should_stop', 'state', 
# 'strategy', 'test', 'test_dataloaders', 'test_loop', 
# 'testing', 'train_dataloader', 'training', 'val_check_batch',
# 'val_check_interval', 'val_dataloaders', 
# 'validate', 'validate_loop', 'validating', 'world_size']