In [None]:
# Draft code that used to live in the __init__() method of class MyLogger(Logger);
# it was used to set up the initial history keys.
# NOTE(review): this is a pasted fragment, not a runnable cell -- it references `self`
# outside of any method, and the dict literal below keeps its original method-level indentation.
num_output_layers = 6

self.num_output_layers = num_output_layers
        # template of the values tracked for ONE output layer; every key starts out as an empty list
        history_per_output_layer = {
            # per epoch
            'loss':[],
            'acc':[],
            'f1_score':[],
            # these 2 will be replaced with step_y_pred and step_y_true respectively
            'y_pred':[],
            'y_true':[],
        } 

def pre_suf_fix_dict(dict, fix_val='output_layer', prefix=True):
    '''
    Return a new dictionary whose keys are the keys of `dict` with `fix_val` attached.

    dict: mapping whose keys get renamed (the mapping itself is not modified)
    fix_val: text to attach to every key, joined with an underscore
    prefix: if True the new key is "<fix_val>_<key>", otherwise "<key>_<fix_val>"

    Every value is shallow-copied, so mutable values (e.g. the empty history lists)
    are NOT shared between the original key and its renamed counterpart; appending
    to one history list therefore no longer shows up under every derived key.
    '''
    import copy  # local import: keeps this draft cell self-contained

    if prefix:
        return {f"{fix_val}_{key}": copy.copy(value) for key, value in dict.items()}
    return {f"{key}_{fix_val}": copy.copy(value) for key, value in dict.items()}

# to create val/test prefixed keys
# (after this call the dict holds the plain keys plus their 'val_' and 'test_' prefixed versions)
history_per_output_layer.update(
    **pre_suf_fix_dict(history_per_output_layer, fix_val='val', prefix=True),
    **pre_suf_fix_dict(history_per_output_layer, fix_val='test', prefix=True),
)
# to create step_ prefixed keys
# (every key present at this point gets a 'step_' twin, e.g. 'step_loss' and 'step_val_loss')
history_per_output_layer.update(
    **pre_suf_fix_dict(history_per_output_layer, fix_val='step', prefix=True),
)
# removing y_pred, y_true and keeping step_y_pred and step_y_true
# NOTE(review): only the un-prefixed 'y_pred'/'y_true' are removed here; the 'val_'/'test_'
# prefixed y_pred/y_true keys created above stay in the dict -- confirm that is intended.
history_per_output_layer.pop('y_pred')
history_per_output_layer.pop('y_true')

# fix_val == suffix without "_"; to provide flexibility in using fix_val as prefix as well (if we wanted to!)
self.out_layers_fix_val = 'output_layer' 

# pre_suf_fix_dict function is used to create unique key names to merge the `num_output_layers` histories into one history dictionary
# (one dict per output layer, keys suffixed with '_output_layer_<i>')
history_ol_lists = [pre_suf_fix_dict(history_per_output_layer, 
                                    fix_val=f'{self.out_layers_fix_val}_{i}', 
                                    prefix=False) 
                    for i in range(self.num_output_layers)] # ol == output layers

# 'loss' here is for the final loss calculated by the loss aggregator function which uses all `num_output_layers` losses
self.history = {'loss':[], 'val_loss':[], 'test_loss':[]}
# add the 'step_' prefixed versions of the three final-loss keys
self.history.update(**pre_suf_fix_dict(self.history, 'step', prefix=True))
# merging the `num_output_layers` histories into one history dictionary
for hist_dict in history_ol_lists:
    self.history.update(**hist_dict)

# snapshot of all key names of the merged history
self.hist_keys = list(self.history.keys())

In [None]:
# removed manual logging
import pytorch_lightning as pl
from torch.nn import functional as F
import torchmetrics

class HierarchalModelPL(pl.LightningModule):
   '''
   LightningModule wrapping a model that has several classification output layers.

   For every output layer it logs a cross-entropy loss plus the metrics listed in
   `self.metrics` (step-level and epoch-level, separately for train/val/test), and
   it optimizes one final loss: the `loss_weights`-weighted sum of the per-layer losses.

   NOTE(review): the `*_epoch_end(outputs)` hooks used below belong to the
   PyTorch Lightning 1.x API -- confirm the installed version still calls them.
   '''

   def __init__(self, hierarchical_model, num_classes_per_layer:list, loss_weights=None, dummy_input_size=(32, 3, 306, 306), the_device='cpu'):
      '''
      hierarchical_model: model whose call returns one tensor per output layer (see `forward()`)
      num_classes_per_layer: number of classes of each output layer; its length defines `num_output_layers`
      loss_weights: per-layer weights used by `metric_reduce_fx()`; defaults to 1 for every layer
      dummy_input_size: shape of the random tensor stored in `example_input_array`
      the_device: device the metric objects are moved to; `self.the_device` keeps this value
                  if it contains 'cpu' and is set to 'cuda' otherwise
      '''
      super().__init__()
      self.hierarchical_model = hierarchical_model # .to(the_device)
      self.num_output_layers = len(num_classes_per_layer)
      self.loss_weights = [1]*self.num_output_layers if loss_weights is None else loss_weights

      # useful for writing computational graph in tensorboard, summary(), etc
      self.example_input_array = torch.randn(*dummy_input_size) # , device=the_device

      self.the_device = the_device if 'cpu' in the_device else 'cuda'
      self.metrics = ['acc', 'f1_score']

      def _create_metric_func(metric_type:str, num_classes):
         # builds a fresh torchmetrics object for one (metric, output layer, dataset split) slot
         metric_type_lower = metric_type.lower()
         if 'acc' in metric_type_lower:
            return torchmetrics.Accuracy(task='multiclass',
                                         num_classes=num_classes,
                                         average="micro").to(the_device)
         if 'f1' in metric_type_lower:
            # average=None --> one F1 value per class instead of a single scalar
            return torchmetrics.F1Score(task="multiclass",
                                        num_classes=num_classes,
                                        average=None).to(the_device)
         # fail loudly instead of silently storing None in self.metric_funcs
         raise ValueError(f"unsupported metric type: {metric_type!r}")

      # Layout of self.metric_funcs:
      #    self.metric_funcs[metric_name][i][j][k]
      #       i --> 0: step-level object, 1: epoch-level object
      #       j --> index of the output layer (0 .. num_output_layers-1)
      #       k --> 0: train, 1: val, 2: test
      # max indexing possible: self.metric_funcs['f1_score'][1][num_output_layers-1][2]
      # Every slot holds its own metric object, so states are never shared between slots.
      self.metric_funcs = {
         metric: [[[_create_metric_func(metric, num_classes_per_layer[j]) for k in range(3)]
                   for j in range(self.num_output_layers)]
                  for i in range(2)]
         for metric in self.metrics
      }

      # fix_val == suffix without "_"; to provide flexibility in using fix_val as prefix as well (if we wanted to!)
      self.history_layers_fix_val = 'output_layer'

   @staticmethod
   def _ds_type_idx(ds_prefix):
      '''Maps a dataset prefix ('' / 'val_' / 'test_') to its train/val/test index (0 / 1 / 2) in `self.metric_funcs`.'''
      if 'val' in ds_prefix:
         return 1
      if 'test' in ds_prefix:
         return 2
      return 0

   def forward(self, x):
      '''
      Returns a list with one element per output of `self.hierarchical_model(x)`,
      each passed through softmax over dim=1.
      '''
      # len(result) == number of hierarchy output layers,
      # while each element is expected to be a 2D tensor of shape
      # (batch_size, num_classes of the i_th output layer in hierarchy)
      return [F.softmax(tensor_output, dim=1) for tensor_output in self.hierarchical_model(x)]

   def training_step(self, batch, batch_idx):
      '''Runs the shared step logic for the training split (no prefix) and returns its dict.'''
      # add other key:value pairs here or pass the entire metrics_dict if you want. just make sure 'loss' key is present
      return self._step_logic(batch, ds_prefix='')

   def training_epoch_end(self, outputs) -> None:
      '''Logs the epoch-level training metrics.'''
      # 'outputs' argument here contains values from what was returned from training_step()
      self._epoch_end_logic(outputs, ds_prefix='')

   def validation_step(self, batch, batch_idx):
      '''Runs the shared step logic with the 'val_' prefix.'''
      return self._step_logic(batch, ds_prefix='val_')

   def validation_epoch_end(self, outputs) -> None:
      '''Logs the epoch-level validation metrics.'''
      self._epoch_end_logic(outputs, ds_prefix='val_')

   def test_step(self, batch, batch_idx):
      '''Runs the shared step logic with the 'test_' prefix.'''
      return self._step_logic(batch, ds_prefix='test_')

   def test_epoch_end(self, outputs) -> None:
      '''Logs the epoch-level test metrics.'''
      self._epoch_end_logic(outputs, ds_prefix='test_')

   def _store_metric(self, metric_name, metric_val):
      '''Forwards one metric value to the attached logger under `metric_name`.'''
      # NOTE(review): relies on the attached logger exposing a `log(name, value)` method -- TODO confirm.
      # An earlier draft called self.log() instead and fell back to metric_val.mean() for
      # non-scalar values (the per-class f1 score tensor); going through the logger keeps the full tensor.
      self.logger.log(metric_name, metric_val)

   def _step_logic(self, batch, ds_prefix=''):
      '''
      Shared logic of training/validation/test steps.

      batch: an (x, y) pair; every entry of y is looked up in `labelToHierarchy`
      ds_prefix: '' for train, 'val_' for validation, 'test_' for test

      Logs the per-layer step loss and step metrics, feeds the epoch-level metric
      objects, logs the aggregated step loss and returns {'loss': final_loss}.
      '''
      # ds_prefix == dataset_prefix
      ds_type_idx = self._ds_type_idx(ds_prefix)
      ds_prefix = 'step_' + ds_prefix

      x, y = batch
      hier_y_pred = self(x)

      # y now has shape (batch_size, len(hier_y_pred))
      # in other words, each row now consists of `len(hier_y_pred)` output layers' labels of 1 sample
      # side note: len(hier_y_pred) == self.num_output_layers
      y = torch.tensor([labelToHierarchy[int(label)] for label in y], dtype=torch.long, device=self.the_device)
      # transpose y to take each row (batch of labels) with its corresponding hier_y_pred row (batch of predicted labels)
      # in other words, each row now consists of `batch_size` labels of the i_th output layer
      y = y.T
      # side note: the wording of "i_th output layers" refers to the strings mentioned in
      # output_order argument of HierarchalModel() used in create_hnn_model_arch()

      losses_dict = {}
      other_metrics_dict = {}
      for out_layer_idx, softmax_output_layer in enumerate(hier_y_pred):
         # 'cur' refers to current output layer
         y_cur = y[out_layer_idx]
         y_pred_cur = softmax_output_layer.to(self.the_device)
         ol_suffix = f'_{self.history_layers_fix_val}_{out_layer_idx}'

         # log step metrics

         # NOTE(review): forward() already applies softmax, while F.cross_entropy is documented to
         # expect unnormalized logits -- confirm whether the loss should use the raw model outputs.
         loss = F.cross_entropy(y_pred_cur, y_cur)
         loss_full_metric_name = f'{ds_prefix}loss{ol_suffix}'
         self._store_metric(loss_full_metric_name, loss)
         losses_dict[loss_full_metric_name] = loss

         for metric_name in self.metrics:
            # recall the indexing of self.metric_funcs:
            # metric_name, step/epoch, num_output_layers-1, train/val/test
            layer_metric_funcs = self.metric_funcs[metric_name]
            # calling the step-level object returns the value for this batch
            metric_val = layer_metric_funcs[0][out_layer_idx][ds_type_idx](y_pred_cur, y_cur)
            full_metric_name = f'{ds_prefix}{metric_name}{ol_suffix}'
            self._store_metric(full_metric_name, metric_val)
            # the epoch-level object only accumulates here; it is computed and reset in _epoch_end_logic().
            # Each batch is fed exactly once (the former extra f1 pass after this loop was removed).
            layer_metric_funcs[1][out_layer_idx][ds_type_idx].update(y_pred_cur, y_cur)
            other_metrics_dict[full_metric_name] = metric_val

      final_loss = self.metric_reduce_fx(losses_dict.values(), 'weighted_sum')
      self._store_metric(f'{ds_prefix}loss', final_loss)

      # Important note: 'loss' key must be present, or else you'll get this error:
      # MisconfigurationException: In automatic_optimization,
      # when `training_step` returns a dict, the 'loss' key needs to be present
      # side note: add `.update(losses_dict)` and `.update(other_metrics_dict)`
      # if you want to directly use other metrics in "..._epoch_end()" methods
      return {'loss': final_loss}

   def _epoch_end_logic(self, outputs, ds_prefix=''):
      '''
      Shared logic of the *_epoch_end hooks.

      outputs: list of the dicts returned by the matching *_step method (each has a 'loss' key)
      ds_prefix: '' for train, 'val_' for validation, 'test_' for test

      Logs the mean of the step losses, then computes, resets and logs every
      epoch-level metric of every output layer.
      '''
      ds_type_idx = self._ds_type_idx(ds_prefix)

      # log epoch metrics
      final_loss_epoch = torch.tensor([x['loss'] for x in outputs], dtype=torch.float64).mean()
      self._store_metric(f'{ds_prefix}loss', final_loss_epoch)

      for out_layer_idx in range(self.num_output_layers):
         ol_suffix = f'_{self.history_layers_fix_val}_{out_layer_idx}'

         for metric_name in self.metrics:
            # recall: '1' for accessing epoch func (not step func)
            epoch_metric_func = self.metric_funcs[metric_name][1][out_layer_idx][ds_type_idx]
            metric_val_epoch = epoch_metric_func.compute()
            # reset so the next epoch starts accumulating from scratch
            epoch_metric_func.reset()
            full_metric_name = f'{ds_prefix}{metric_name}{ol_suffix}'
            self._store_metric(full_metric_name, metric_val_epoch)
            # console peek at the first output layer's epoch metrics
            if out_layer_idx == 0:
               print(metric_name)
               print(metric_val_epoch)
               print()

   def metric_reduce_fx(self, metric_list, agg_type='weighted_sum'):
      '''
      aggregates metric values from all `num_output_layers` into a single value
      side note: called "reduce_fx" as a reference to PyTorch Lightning's reduce_fx parameter found in self.log()

      metric_list: iterable with one value per output layer, in output-layer order
      agg_type: only a weighted sum is supported (any text containing both 'weighted' and 'sum')

      Returns the sum of `self.loss_weights[i] * metric_list[i]`.
      Raises ValueError for any other `agg_type`.
      '''
      agg_type_lower = agg_type.lower()
      if not ('weighted' in agg_type_lower and 'sum' in agg_type_lower):
         raise ValueError(f"unsupported agg_type: {agg_type!r}")

      weighted_sum_val = 0
      for i, layer_metric_val in enumerate(metric_list):
         weighted_sum_val += self.loss_weights[i] * layer_metric_val
      return weighted_sum_val

   def configure_optimizers(self):
      '''Returns an Adam optimizer (lr=1e-3) over all parameters of this module.'''
      # make it self.hierarchical_model.parameters() if you defined other parameters in __init()__ which you don't want to optimize
      # side note: under the hood, pl automatically gets gradients
      # from final_loss returned from training_step() and adjusts the models' branches accordingly
      # source: https://github.com/Lightning-AI/lightning/issues/2645#issuecomment-660681760
      return torch.optim.Adam(self.parameters(), lr=1e-3)


In [None]:
# draft function: useless
def _get_num_classes(key, hierarchy_version):
    '''
    get the number of classes for the output layer which its order is specified in ol_num parameter,
    and based on level of hierarchy chosen
    Example: if hierarchy_version is 1, then a key with "0" suffix (as in, ..._output_layer_0 for example),
    will be considered an output layer with 2 classes: high/low color diversity, and if "1", then one of the 9 flat classes.
    However, if hierarchy_version is set to 2, then "0" will have same meaning, but "1" will mean `Category` classes (which were 4),
    and "2" will mean the 9 flat classes

    To summarize: you'll most likely change this function's logic based on the possible hierarchies that you want to expirement with
    '''
    if hierarchy_version == '1':
        if '0' in key:
            return ['00. selfies', ..., '82. academicDigital']

In [None]:
# useless draft code for log_metrics() function in MyLogger class:
# NOTE(review): pasted fragment -- `metrics`, `torch` and `np` come from code that is not shown here,
# and the if/else below keeps its original, inconsistent indentation, so this cell is not runnable as-is.
# ... code above
for metric_name, metric_val in metrics.items():

    # a single tensor is converted directly to a numpy array
    if isinstance(metric_val, torch.Tensor):
                    # I've added .clone() as I don't know if calling detach() on losses are advisable or not
                    # so using clone() and detach() to avoid detaching original losses from computation graph
                    # source for using clone():
                    # https://www.educba.com/pytorch-detach/#:~:text=If%20we%20need%20to%20copy%20constructs%20from%20the%20tensor%2C%20we%20can%20use%20sourceTensor.clone().detach()
                    metric_val = np.array(metric_val.clone().detach().cpu())
                else:
                    # anything else is iterated over and each item is converted the same way
                    # (presumably a collection of tensors -- verify against the caller)
                    metric_val = np.array([np.array(tensor.clone().detach().cpu()) for tensor in metric_val])

    # converting tensors to list
    metric_val = metric_val.tolist() # tolist() will not however convert a single float value to a list (but will leave it as float instead)
    # ... code below

Debugging why stratification of imbalanced dataset doesn't really work

In [None]:
# iterator over the training sampler, consumed step by step in the next cell
itrrr = iter(trainer_debug.train_dataloader.sampler)

In [None]:
# print the labels of the next 10 indices handed out by the sampler iterator
for ii in range(1, 11):
    idx = next(itrrr)
    print(trainer_debug.train_dataloader.sampler.labels_in_csv[idx])

tensor(7)
tensor(5)
tensor(6)
tensor(2)
tensor(1)
tensor(0)
tensor(5)
tensor(5)
tensor(7)
tensor(1)


In [None]:
# With stratification: positions at which the academicDigital class (label 8, the most minor one)
# appears in the sampler's label order after stratification.
# As the positions show, there are steps (i.e., batches of 32) in which that class doesn't appear at all.
lbls_loc = [i for i, a in enumerate(trainer_debug.train_dataloader.sampler.labels_after_strat) if a == 8]

# distance from the last recorded position; a position is only recorded
# once that distance reaches 64, closer occurrences are skipped
diff_between_locs = []
prev_loc = 0
for loc in lbls_loc:
    gap = loc - prev_loc
    if gap >= 64:
        diff_between_locs.append(gap)
        prev_loc = loc
len(diff_between_locs), lbls_loc

(705,
 [117,
  139,
  145,
  252,
  290,
  360,
  388,
  396,
  573,
  618,
  639,
  681,
  688,
  716,
  738,
  977,
  1111,
  1159,
  1191,
  1195,
  1293,
  1436,
  1578,
  1626,
  1648,
  1683,
  1782,
  1783,
  1905,
  2113,
  2122,
  2167,
  2169,
  2181,
  2193,
  2246,
  2249,
  2260,
  2354,
  2388,
  2417,
  2449,
  2493,
  2525,
  2561,
  2598,
  2648,
  2675,
  2691,
  2692,
  2715,
  2962,
  3007,
  3049,
  3119,
  3121,
  3127,
  3135,
  3242,
  3437,
  3444,
  3464,
  3467,
  3491,
  3507,
  3511,
  3535,
  3569,
  3578,
  3668,
  3967,
  4144,
  4194,
  4222,
  4254,
  4620,
  4670,
  4693,
  4768,
  4780,
  4817,
  4820,
  4881,
  4941,
  5068,
  5104,
  5157,
  5259,
  5301,
  5304,
  5354,
  5468,
  5513,
  5721,
  5848,
  5930,
  5989,
  6058,
  6206,
  6209,
  6363,
  6378,
  6748,
  6802,
  6824,
  7152,
  7277,
  7305,
  7318,
  7401,
  7430,
  7561,
  7677,
  7743,
  7822,
  7852,
  7919,
  7929,
  8004,
  8009,
  8016,
  8135,
  8218,
  8297,
  8436,
  8480,
  

In [None]:
# Without stratification: positions at which the academicDigital class (label 8, the most minor one)
# appears in the dataset's label order.
# As the positions show, there are steps (i.e., batches of 32) in which that class doesn't appear at all.
# No big difference between enabling/disabling stratification, as the dataset is too imbalanced.
lbls_loc = [i for i, a in enumerate(trainer_debug.train_dataloader.dataset.datasets.labels) if a == 8]

# distance from the last recorded position; a position is only recorded
# once that distance reaches 64, closer occurrences are skipped
diff_between_locs = []
prev_loc = 0
for loc in lbls_loc:
    gap = loc - prev_loc
    if gap >= 64:
        diff_between_locs.append(gap)
        prev_loc = loc
len(diff_between_locs), lbls_loc

(736,
 [23,
  66,
  181,
  281,
  326,
  388,
  396,
  510,
  564,
  606,
  624,
  645,
  695,
  847,
  1020,
  1091,
  1183,
  1357,
  1417,
  1451,
  1666,
  1698,
  1744,
  1788,
  1851,
  1858,
  1860,
  1931,
  1968,
  2044,
  2056,
  2094,
  2160,
  2177,
  2239,
  2264,
  2273,
  2372,
  2407,
  2414,
  2432,
  2445,
  2705,
  2861,
  2902,
  2996,
  3050,
  3086,
  3269,
  3289,
  3346,
  3598,
  3627,
  3693,
  3718,
  3801,
  3951,
  4095,
  4120,
  4171,
  4228,
  4296,
  4297,
  4338,
  4450,
  4495,
  4531,
  4599,
  4726,
  4797,
  4853,
  4877,
  4882,
  4886,
  4919,
  4935,
  4971,
  5039,
  5096,
  5140,
  5246,
  5393,
  5408,
  5444,
  5451,
  5459,
  5660,
  5725,
  5745,
  5757,
  5943,
  5973,
  6132,
  6177,
  6241,
  6300,
  6309,
  6336,
  6390,
  6644,
  6660,
  6669,
  6752,
  6909,
  6964,
  6965,
  7076,
  7217,
  7258,
  7297,
  7338,
  7365,
  7414,
  7435,
  7496,
  7504,
  7508,
  7517,
  7546,
  7578,
  7613,
  7617,
  7629,
  7710,
  7862,
  7904,
  

In [None]:
# positions of label 8 among the true labels stored under 'step_y_true_ol_1'
aa = [i for i, a in enumerate(history['step_y_true_ol_1']) if a == 8]
len(aa), aa

(6, [117, 139, 145, 277, 299, 305])

In [None]:
# total number of true labels stored under 'step_y_true_ol_1'
len(history['step_y_true_ol_1'])

320