# Supervised Learning with toybrains dataset using Lightning
> Dev log (format < Date > | <Author(s)> )  
> - Developed: 30 May 2023 | JiHoon Kim <br>
> - Tested and improved: 17 July 2023 | Roshan Rane <br>
> - Tested: 28 July 2023 | JiHoon Kim

NOTE : 
- Setting `shuffle=False` is recommended for both the validation and test dataloaders.
- This notebook is used as a testbed and is outdated.

## Installing Libraries

In [1]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Load IPython's autoreload extension in mode 2 so that edits to imported
# modules (e.g. the local `utils` package) are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

In [2]:
# check which GPUs are available and how much of their memory is in use
# (shell escape: requires the `gpustat` command to be installed)
# NOTE(review): the stored output lists other users' processes — consider
# clearing it before sharing the notebook.
! gpustat

[1m[37mcuda02                    [m  Fri Jul 28 18:43:59 2023  [1m[30m470.199.02[m
[36m[0][m [34mNVIDIA GeForce RTX 3090[m |[31m 45'C[m, [32m  0 %[m | [36m[1m[33m16214[m / [33m24268[m MB | [1m[30mtomasz[m([33m813M[m) [1m[30mtomasz[m([33m2133M[m) [1m[30mtomasz[m([33m3569M[m) [1m[30mtomasz[m([33m895M[m) [1m[30mtomasz[m([33m2249M[m) [1m[30mtomasz[m([33m1219M[m) [1m[30mtomasz[m([33m1057M[m) [1m[30mtomasz[m([33m1869M[m) [1m[30mtomasz[m([33m1057M[m) [1m[30mmanuel[m([33m1343M[m) [1m[30mgdm[m([33m4M[m)
[36m[1][m [34mNVIDIA GeForce RTX 3090[m |[1m[31m 51'C[m, [32m 15 %[m | [36m[1m[33m19837[m / [33m24268[m MB | [1m[30mjihoon[m([33m2249M[m) [1m[30mjihoon[m([33m2443M[m) [1m[30mjihoon[m([33m2445M[m) [1m[30mjihoon[m([33m2443M[m) [1m[30mjihoon[m([33m2443M[m) [1m[30mjihoon[m([33m2445M[m) [1m[30mjihoon[m([33m2445M[m) [1m[30mmanuel[m([33m1343M[m) [1m[30mjihoon[m([33m1569M

In [3]:
# standard python packages
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt

import torch
# import monai
import lightning as L
from lightning.pytorch.loggers import CSVLogger

In [4]:
# add custom imports
# (TODO) refactoring needed
from utils.dataset import generate_dataset
from utils.DLutils import (
    get_dataset_loaders,
    ToyBrainsDataset, LightningModel,
    PyTorchMLP, LogisticRegression, viz_batch
)

## Generating the synthetic dataset

In [None]:
# ! python create_toybrains.py --dir toybrains30k -d -n 30000

## Loading the DataLoader

In [5]:
# Fix every random number generator used in this notebook so runs are repeatable.
seed = 42
for seed_fn in (torch.manual_seed, np.random.seed, random.seed):
    seed_fn(seed)
# Lightning's global seeding helper; kept as the last expression so the seed
# it returns is shown as the cell output.
L.seed_everything(seed)

Global seed set to 42


42

In [6]:
# dataset location
DATA_DIR = 'toybrains30k/'
raw_csv_path = DATA_DIR+'toybrains_n30000.csv'
# Keep the DataFrame itself: DataFrame.info() only prints a summary and returns
# None, so chaining it onto read_csv would leave `data_df` set to None.
data_df = pd.read_csv(raw_csv_path)
data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 34 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   subjectID                30000 non-null  int64  
 1   _gen_brain_vol-radminor  30000 non-null  float64
 2   _gen_brain_vol-radmajor  30000 non-null  float64
 3   gen_brain_thick          30000 non-null  float64
 4   gen_brain_int            30000 non-null  float64
 5   gen_border_int           30000 non-null  object 
 6   gen_vent_thick           30000 non-null  float64
 7   gen_shape-top_curv       30000 non-null  float64
 8   gen_shape-top_int        30000 non-null  object 
 9   _gen_shape-top_vol-rad   30000 non-null  float64
 10  gen_shape-midr_curv      30000 non-null  float64
 11  gen_shape-midr_int       30000 non-null  object 
 12  _gen_shape-midr_vol-rad  30000 non-null  float64
 13  gen_shape-midl_curv      30000 non-null  float64
 14  gen_shape-midl_int    

In [7]:
# target label to predict; pick any of the available columns in the table
label = "lblbin_shp-vol"

In [8]:
# Build the training / validation / test DataFrames from the raw CSV for the
# chosen label, using the fixed seed.
df_train, df_val, df_test = generate_dataset(raw_csv_path, label, seed)
# Report the size of each split, one per line.
print(
    f" Training data split = {len(df_train)} \n"
    f" Validation data split = {len(df_val)} \n"
    f" Test data split = {len(df_test)}"
)

 Training data split = 24000 
 Validation data split = 3000 
 Test data split = 3000


### Estimate the baselines
1. [TODO] Accuracy when predicting the most frequent class label:
   ```python
   from collections import Counter
   
   counter = Counter()
   for images, labels in loader:
       counter.update(labels.tolist())
   print(f"\nLabel distribution: {sorted(counter.items())}")
   
   majority_class = counter.most_common(1)[0]
   print(f"Majority class: {majority_class[0]}")
   
   baseline_acc = majority_class[1] / sum(counter.values())
   print(f"Accuracy when always predicting the majority class: {baseline_acc:.2f} ({baseline_acc*100:.2f}%)")
   ```

2. [TODO] Best expected accuracy when predicting directly from the image attributes.
3. [TODO] Best expected accuracy after deconfounding.

In [9]:
# prepare the DataLoaders for the training, validation and test splits
# NOTE(review): `shuffle=True` is passed once here, while the note at the top of
# this notebook recommends shuffle=False for validation and test — confirm how
# get_dataset_loaders applies this flag to each split.
train_loader, val_loader, test_loader = get_dataset_loaders(
                    data_split_dfs=[df_train, df_val, df_test],
                    data_dir=DATA_DIR,
                    batch_size=16, shuffle=True, 
                    num_workers=0, transform=[])

### Visualize a Batch from Each DataLoader

In [None]:
# Visualize a batch from each of the three loaders (see the section heading).
# NOTE(review): `debug=True` is forwarded to viz_batch; its effect is defined in
# utils.DLutils and is not visible here — TODO confirm.
viz_batch(train_loader, title="Training images", debug=True)
viz_batch(val_loader, title="Validation images", debug=True)
viz_batch(test_loader, title="Test images", debug=True)

## Predictive Modeling

Tasks : [baseline model](https://github.com/RoshanRane/toybrains/issues/1)

In [None]:
# multi-layer perceptron implementation
# NOTE(review): 12288 is presumably the flattened image size (e.g. 64*64*3) — TODO confirm
pytorch_model = PyTorchMLP(num_features=12288, num_classes=2)

In [None]:
# alternative: MONAI DenseNet121 (uncomment the line below and `import monai` in the imports cell to use it instead)

# pytorch_model = monai.networks.nets.DenseNet121(spatial_dims=2, in_channels=3, out_channels=2)

In [None]:
# logistic regression implementation (alternative; uncomment to use)

# pytorch_model = LogisticRegression(num_features=12288)

In [None]:
# Wrap the selected PyTorch model in the project's LightningModel,
# using a learning rate of 0.05.
lightning_model = LightningModel(model=pytorch_model, learning_rate=0.05)

In [None]:
# Metrics are written as CSV files by a logger rooted at "logs/" with the
# experiment name "new_model".
csv_logger = CSVLogger(save_dir="logs/", name="new_model")

# Train for at most 10 epochs on the GPU with index 1, with deterministic mode on.
trainer = L.Trainer(
    logger=csv_logger,
    accelerator="gpu",
    devices=[1],
    max_epochs=10,
    deterministic=True,
)

In [None]:
# Fit the Lightning model on the training loader, validating on the validation loader.
trainer.fit(lightning_model, train_dataloaders=train_loader, val_dataloaders=val_loader)

In [None]:
# Run the test loop once per split (train, val, test — in that order) and keep
# the "accuracy" entry of the first result dict for each.
split_loaders = {"Train": train_loader, "Val": val_loader, "Test": test_loader}
split_acc = {
    split_name: trainer.test(dataloaders=loader)[0]["accuracy"]
    for split_name, loader in split_loaders.items()
}
train_acc, val_acc, test_acc = split_acc.values()

# One-line summary, accuracies shown as percentages.
print(" | ".join(f"{split_name} Acc {acc*100:.2f}%" for split_name, acc in split_acc.items()))

In [None]:
# Run the test loop on the training loader and keep the full list of result dicts.
# NOTE(review): the same loader is already evaluated in the previous cell, so this
# looks redundant — confirm before removing.
train_result = trainer.test(dataloaders=train_loader)

`Known issue with the cell below: if you train on multiple GPUs, a single run may generate several version_X directories.` <br>
Make sure the path points to the `metrics.csv` that contains all the logged metrics. <br>
(TODO) Refactor the result-plotting code.

In [None]:
# Read the metrics written by this run's logger. Taking the directory from the
# trainer's logger avoids hard-coding an experiment name / version_X folder,
# which previously did not match the logger configured for the Trainer
# (name="new_model") and pointed at one fixed version.
# NOTE: with multiple GPUs several version_X folders may be created in one run;
# check that this file contains all the logged metrics.
metrics_csv_path = f"{trainer.logger.log_dir}/metrics.csv"
metrics = pd.read_csv(metrics_csv_path)

# Average all logged rows that share the same epoch -> one row per epoch.
# reset_index() keeps "epoch" as a regular column and a 0..n-1 row index.
agg_col = "epoch"
df_metrics = metrics.groupby(agg_col).mean().reset_index()

# Loss curves (training vs. validation).
df_metrics[["train_loss", "val_loss"]].plot(
    grid=True, legend=True, xlabel="Epoch", ylabel="Loss"
)
# Accuracy curves (training vs. validation).
df_metrics[["train_acc", "val_acc"]].plot(
    grid=True, legend=True, xlabel="Epoch", ylabel="ACC"
)

plt.show()

In [None]:
# Persist only the underlying PyTorch model's state_dict to disk.
PATH = "lightning.pt"
model_weights = pytorch_model.state_dict()
torch.save(model_weights, PATH)