# Run Training

In [1]:
from datetime import datetime

# Stamp the notebook with the wall-clock time of this execution
# (day.month.year hour:minute:second).
run_started = datetime.now().strftime("%d.%m.%Y %H:%M:%S")
print("Run at:", run_started)

Run at: 12.04.2024 17:18:19


In [2]:
# Auto-reload edited project modules before each cell runs
#@formatter:off
%load_ext autoreload
%autoreload 2
#@formatter:on

from torchsummary import summary
import torch

import lightning as pl
from lightning.pytorch.callbacks.early_stopping import EarlyStopping
from lightning.pytorch.loggers import TensorBoardLogger
from lightning.pytorch.callbacks import TQDMProgressBar

from pathlib import Path
import importlib

from run import path_resolution, train, evaluate

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# --- Run switches -----------------------------------------------------------
debug = True        # debug run: max_epochs forced to 2, artefacts under checkpoints_debug/
dry_run = False     # handed to the Trainer as fast_dev_run
computease = False  # when True, float32 matmul precision is set to 'medium'
num_worker = 5      # handed to the data module as n_jobs

# --- Active experiment ------------------------------------------------------
# lib    -> module imported as models.<lib>
# params -> attribute of that module holding the hyper-parameter set
# data   -> module datasets.<data> and the class of the same name inside it
lib, params, data = "DenseNet", "hp_dropna", "RSO_LModule"

# --- Alternative experiments (swap in for the assignment above) -------------
# lib, params, data = "DenseNet", "hp_default", "RSO_LModule"
# lib, params, data = "Sequence", "hp_default", "RSO_LModule_Seq"
# lib, params, data = "T_Sequence", "hp_default", "T_LModule_Seq"
# lib, params, data = "Sequence_pretrained", "hp_smaller", "RSO_LModule_Seq"
# lib, params, data = "Sequence_pretrained", "hp_default", "RSO_LModule_Seq"

In [4]:
# Suffix appended to the checkpoint directory name so that debug runs
# are kept apart from regular runs.
if debug:
    interm_path = '_debug'
else:
    interm_path = ''

In [5]:
# Resolve the data root with the project's `path_resolution` helper (imported from run).
# In the recorded run it performs an rsync of the data directory and prints the
# resulting base path; the returned value is later passed to the data module as data_dir.
base_path = path_resolution()

 -- rsync --
Calling: rsync -av /share/temp/yhartmann/smart-cities-journal-based-on-jonahs-ma/data/ /share/data/yhartmann/data/ma-jonah/
sending incremental file list

sent 2,937 bytes  received 31 bytes  5,936.00 bytes/sec
total size is 20,378,829,669  speedup is 6,866,182.50
 -- rsync finished --

Base path: /share/data/yhartmann/data/ma-jonah/


In [6]:
# Checkpoint directory: ./checkpoints (or ./checkpoints_debug), made absolute
# and created on demand.
checkpoint_path = Path("./checkpoints" + interm_path + "/").resolve()
checkpoint_path.mkdir(exist_ok=True, parents=True)

# Checkpoint file for this configuration: <checkpoint dir>/<lib>.<params>.ckpt
model_checkpoint_path = str(checkpoint_path / lib) + "." + params + ".ckpt"
print("model_checkpoint_path: " + model_checkpoint_path)

model_checkpoint_path: /share/temp/yhartmann/smart-cities-journal-based-on-jonahs-ma/pipeline/checkpoints_debug/DenseNet.hp_dropna.ckpt


# Load model

In [7]:
# Import the network definition selected by `lib` (module models.<lib>).
NNModule = importlib.import_module(f"models.{lib}")

# Look up the hyper-parameter set named by `params` on that module.
# Work on copies instead of editing the module's dict in place: the module
# (and therefore its dict) stays cached for the lifetime of the kernel, so an
# in-place debug override would silently persist after `debug` is switched
# back to False and the cells are re-run.
hyper_params = dict(getattr(NNModule, params))
if debug:
    # Debug runs are capped at two epochs.
    hyper_params['trainer_params'] = {**hyper_params['trainer_params'], 'max_epochs': 2}
    # hyper_params['trainer_params'] = {**hyper_params['trainer_params'], 'profiler': 'simple'}

In [8]:
# Collect the constructor arguments from the selected hyper-parameter set,
# then build the network class exposed by the imported models.<lib> module.
network_kwargs = {
    key: hyper_params[key]
    for key in ('model_params', 'optimizer', 'loss_function',
                'optimizer_params', 'scheduler_params')
}
model = NNModule.NeuralNetwork(**network_kwargs)

2024-04-12 17:18:23,313 - torch.distributed.nn.jit.instantiator - INFO - Created a temporary directory at /tmp/tmpx4pkiv17
2024-04-12 17:18:23,315 - torch.distributed.nn.jit.instantiator - INFO - Writing /tmp/tmpx4pkiv17/_remote_module_non_scriptable.py


/home/yale1/miniconda3/envs/smart-cities/lib/python3.10/site-packages/lightning/pytorch/utilities/parsing.py:199: Attribute 'loss_function' is an instance of `nn.Module` and is already saved during checkpointing. It is recommended to ignore them using `self.save_hyperparameters(ignore=['loss_function'])`.


## Create Datasets

In [9]:
# Look up the data-module class: the module datasets.<data> is expected to
# expose a class with the same name as the module.
dataset_module = importlib.import_module("datasets." + data)
DLModule = getattr(dataset_module, data)

# Instantiate it on the resolved data root; remaining options come from the
# 'data_params' entry of the hyper-parameter set.
data_module = DLModule(
    data_dir=base_path,
    n_jobs=num_worker,  # alternative kept from the original: n_jobs=1
    debug=debug,
    **hyper_params['data_params'],
)

# Training

In [10]:
# Show the data shape reported by the data module, then the layer-by-layer
# summary of the model for an input of that shape.
print(data_module.data_shape)
# The trailing ';' stops Jupyter from echoing summary()'s return value, which
# would repeat the table that summary() has already printed.
summary(model, data_module.data_shape);

(1, 480, 848)
Layer (type:depth-idx)                   Output Shape              Param #
├─DenseNet: 1-1                          [-1, 63]                  --
|    └─Sequential: 2-1                   [-1, 1024, 15, 26]        --
|    |    └─Conv2d: 3-1                  [-1, 64, 240, 424]        3,136
|    |    └─BatchNorm2d: 3-2             [-1, 64, 240, 424]        128
|    |    └─ReLU: 3-3                    [-1, 64, 240, 424]        --
|    |    └─MaxPool2d: 3-4               [-1, 64, 120, 212]        --
|    |    └─_DenseBlock: 3-5             [-1, 256, 120, 212]       335,040
|    |    └─_Transition: 3-6             [-1, 128, 60, 106]        33,280
|    |    └─_DenseBlock: 3-7             [-1, 512, 60, 106]        919,680
|    |    └─_Transition: 3-8             [-1, 256, 30, 53]         132,096
|    |    └─_DenseBlock: 3-9             [-1, 1024, 30, 53]        2,837,760
|    |    └─_Transition: 3-10            [-1, 512, 15, 26]         526,336
|    |    └─_DenseBlock: 3-11       

Layer (type:depth-idx)                   Output Shape              Param #
├─DenseNet: 1-1                          [-1, 63]                  --
|    └─Sequential: 2-1                   [-1, 1024, 15, 26]        --
|    |    └─Conv2d: 3-1                  [-1, 64, 240, 424]        3,136
|    |    └─BatchNorm2d: 3-2             [-1, 64, 240, 424]        128
|    |    └─ReLU: 3-3                    [-1, 64, 240, 424]        --
|    |    └─MaxPool2d: 3-4               [-1, 64, 120, 212]        --
|    |    └─_DenseBlock: 3-5             [-1, 256, 120, 212]       335,040
|    |    └─_Transition: 3-6             [-1, 128, 60, 106]        33,280
|    |    └─_DenseBlock: 3-7             [-1, 512, 60, 106]        919,680
|    |    └─_Transition: 3-8             [-1, 256, 30, 53]         132,096
|    |    └─_DenseBlock: 3-9             [-1, 1024, 30, 53]        2,837,760
|    |    └─_Transition: 3-10            [-1, 512, 15, 26]         526,336
|    |    └─_DenseBlock: 3-11            [-1, 1024

In [11]:
# use tensor cores
# Opt-in via the `computease` switch from the config cell: when it is set, the
# float32 matmul precision is set to 'medium' and a short notice is printed.
# With the switch off this cell does nothing.
if computease:
    torch.set_float32_matmul_precision('medium')
    print('Using Tensor Cores')

In [12]:
# TensorBoard logger rooted at the checkpoint directory, named after the
# model (`lib`) and versioned by the hyper-parameter set (`params`).
tb_logger = TensorBoardLogger(checkpoint_path, name=lib, version=params)

# Early stopping is configured from the hyper-parameters; the progress bar
# uses refresh_rate 1 in debug mode and 100 otherwise.
trainer_callbacks = [
    EarlyStopping(**hyper_params["early_stopping_params"]),
    TQDMProgressBar(refresh_rate=1 if debug else 100),
]

# Remaining Trainer options come from the 'trainer_params' entry.
trainer = pl.Trainer(
    logger=tb_logger,
    fast_dev_run=dry_run,
    default_root_dir=checkpoint_path,
    callbacks=trainer_callbacks,
    **hyper_params['trainer_params'],
)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [13]:
# Run training through the project's `train` helper (imported from run), passing
# the model, trainer, data module and the checkpoint file path.
# NOTE(review): in the recorded run this logged "CUDA error: device kernel image is
# invalid" from the `run` logger instead of raising — possibly a PyTorch build /
# GPU architecture mismatch; confirm the environment before trusting any results.
train(model, trainer, data_module, model_checkpoint_path)

[codecarbon INFO @ 17:18:27] offline tracker init
[codecarbon INFO @ 17:18:27] [setup] RAM Tracking...
[codecarbon INFO @ 17:18:27] [setup] GPU Tracking...
[codecarbon INFO @ 17:18:27] Tracking Nvidia GPU via pynvml
[codecarbon INFO @ 17:18:27] [setup] CPU Tracking...


[codecarbon INFO @ 17:18:28] CPU Model on constant consumption mode: Intel(R) Xeon(R) Gold 5118 CPU @ 2.30GHz
[codecarbon INFO @ 17:18:28] >>> Tracker's metadata:
[codecarbon INFO @ 17:18:28]   Platform system: Linux-5.4.0-135-generic-x86_64-with-glibc2.31
[codecarbon INFO @ 17:18:28]   Python version: 3.10.14
[codecarbon INFO @ 17:18:28]   CodeCarbon version: 2.3.5
[codecarbon INFO @ 17:18:28]   Available RAM : 251.554 GB
[codecarbon INFO @ 17:18:28]   CPU count: 48
[codecarbon INFO @ 17:18:28]   CPU model: Intel(R) Xeon(R) Gold 5118 CPU @ 2.30GHz
[codecarbon INFO @ 17:18:28]   GPU count: 1
[codecarbon INFO @ 17:18:28]   GPU model: 1 x NVIDIA GeForce RTX 2080 Ti


Layer (type:depth-idx)                   Output Shape              Param #
├─DenseNet: 1-1                          [-1, 63]                  --
|    └─Sequential: 2-1                   [-1, 1024, 15, 26]        --
|    |    └─Conv2d: 3-1                  [-1, 64, 240, 424]        3,136
|    |    └─BatchNorm2d: 3-2             [-1, 64, 240, 424]        128
|    |    └─ReLU: 3-3                    [-1, 64, 240, 424]        --
|    |    └─MaxPool2d: 3-4               [-1, 64, 120, 212]        --
|    |    └─_DenseBlock: 3-5             [-1, 256, 120, 212]       335,040
|    |    └─_Transition: 3-6             [-1, 128, 60, 106]        33,280
|    |    └─_DenseBlock: 3-7             [-1, 512, 60, 106]        919,680
|    |    └─_Transition: 3-8             [-1, 256, 30, 53]         132,096
|    |    └─_DenseBlock: 3-9             [-1, 1024, 30, 53]        2,837,760
|    |    └─_Transition: 3-10            [-1, 512, 15, 26]         526,336
|    |    └─_DenseBlock: 3-11            [-1, 1024

2024-04-12 17:18:28.661830: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-04-12 17:18:28.706380: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
QUEUEING TASKS | : 100%|██████████| 5/5 [00:00<00:00, 1886.78it/s]
PROCESSING TASKS | : 100%|██████████| 5/5 [00:04<00:00,  1.01it/s]
COLLECTING RESULTS | : 100%|██████████| 5/5 [00:00<00:00, 35187.11it/s]
/home/yale1/miniconda3/envs/smart-cities/lib/python3.10/site-packages/lightning/pytorch/callbacks/model_checkpoint.py:653: Checkpoint directory /share/temp/yhartmann/smart-cities-journal-based-on-jonahs-ma/pipeline/checkpoints_debug/DenseNet/hp_dropna/checkpoints exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

2024-04-12 17:18:35,145 - run - ERROR - CUDA error: device kernel image is invalid
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Traceback (most recent call last):
  File "/home/yale1/miniconda3/envs/smart-cities/lib/python3.10/site-packages/lightning/pytorch/trainer/call.py", line 44, in _call_and_handle_interrupt
    return trainer_fn(*args, **kwargs)
  File "/home/yale1/miniconda3/envs/smart-cities/lib/python3.10/site-packages/lightning/pytorch/trainer/trainer.py", line 580, in _fit_impl
    self._run(model, ckpt_path=ckpt_path)
  File "/home/yale1/miniconda3/envs/smart-cities/lib/python3.10/site-packages/lightning/pytorch/trainer/trainer.py", line 963, in _run
    self.strategy.setup(self)
  File "/home/yale1/miniconda3/envs/smart-cities/lib/python3.10/site-packages/lightning/pytorch/strategies/strategy.py", line 149, in setup
    self.accelerator.setup(tr

[codecarbon INFO @ 17:18:35] 
Graceful stopping: collecting and writing information.
Please wait a few seconds...
[codecarbon INFO @ 17:18:35] Energy consumed for RAM : 0.000181 kWh. RAM Power : 94.33266592025757 W
[codecarbon INFO @ 17:18:35] Energy consumed for all GPUs : 0.000105 kWh. Total GPU Power : 54.69078189695336 W
[codecarbon INFO @ 17:18:35] Energy consumed for all CPUs : 0.000082 kWh. Total CPU Power : 42.5 W
[codecarbon INFO @ 17:18:35] 0.000368 kWh of electricity used since the beginning.
[codecarbon INFO @ 17:18:35] Done!



# Evaluate Model

In [14]:
# Run evaluation through the project's `evaluate` helper (imported from run); the
# recorded traceback shows it calling trainer.test(...) with the checkpoint path.
# NOTE(review): in the recorded run the same CUDA error was logged and the call
# returned three empty dicts, i.e. no evaluation results were produced.
evaluate(model, trainer, data_module, model_checkpoint_path)

[codecarbon INFO @ 17:18:35] offline tracker init
[codecarbon INFO @ 17:18:35] [setup] RAM Tracking...
[codecarbon INFO @ 17:18:35] [setup] GPU Tracking...
[codecarbon INFO @ 17:18:35] Tracking Nvidia GPU via pynvml
[codecarbon INFO @ 17:18:35] [setup] CPU Tracking...
[codecarbon INFO @ 17:18:36] CPU Model on constant consumption mode: Intel(R) Xeon(R) Gold 5118 CPU @ 2.30GHz
[codecarbon INFO @ 17:18:36] >>> Tracker's metadata:
[codecarbon INFO @ 17:18:36]   Platform system: Linux-5.4.0-135-generic-x86_64-with-glibc2.31
[codecarbon INFO @ 17:18:36]   Python version: 3.10.14
[codecarbon INFO @ 17:18:36]   CodeCarbon version: 2.3.5
[codecarbon INFO @ 17:18:36]   Available RAM : 251.554 GB
[codecarbon INFO @ 17:18:36]   CPU count: 48
[codecarbon INFO @ 17:18:36]   CPU model: Intel(R) Xeon(R) Gold 5118 CPU @ 2.30GHz
[codecarbon INFO @ 17:18:36]   GPU count: 1
[codecarbon INFO @ 17:18:36]   GPU model: 1 x NVIDIA GeForce RTX 2080 Ti




QUEUEING TASKS | : 100%|██████████| 1/1 [00:00<00:00, 2538.92it/s]
PROCESSING TASKS | : 100%|██████████| 1/1 [00:01<00:00,  1.94s/it]
COLLECTING RESULTS | : 100%|██████████| 1/1 [00:00<00:00, 8256.50it/s]
QUEUEING TASKS | : 100%|██████████| 1/1 [00:00<00:00, 1037.68it/s]
PROCESSING TASKS | : 100%|██████████| 1/1 [00:01<00:00,  1.94s/it]
COLLECTING RESULTS | : 100%|██████████| 1/1 [00:00<00:00, 8525.01it/s]
Restoring states from the checkpoint path at /share/temp/yhartmann/smart-cities-journal-based-on-jonahs-ma/pipeline/checkpoints_debug/DenseNet.hp_dropna.ckpt


2024-04-12 17:18:40,446 - run - ERROR - CUDA error: device kernel image is invalid
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Traceback (most recent call last):
  File "/share/temp/yhartmann/smart-cities-journal-based-on-jonahs-ma/pipeline/run.py", line 137, in evaluate
    res_dict["skeleton"] = trainer.test(model, data_module, ckpt_path=checkpoint_path)
  File "/home/yale1/miniconda3/envs/smart-cities/lib/python3.10/site-packages/lightning/pytorch/trainer/trainer.py", line 754, in test
    return call._call_and_handle_interrupt(
  File "/home/yale1/miniconda3/envs/smart-cities/lib/python3.10/site-packages/lightning/pytorch/trainer/call.py", line 44, in _call_and_handle_interrupt
    return trainer_fn(*args, **kwargs)
  File "/home/yale1/miniconda3/envs/smart-cities/lib/python3.10/site-packages/lightning/pytorch/trainer/trainer.py", line 794, in _test_impl

[codecarbon INFO @ 17:18:40] 
Graceful stopping: collecting and writing information.
Please wait a few seconds...
[codecarbon INFO @ 17:18:40] Energy consumed for RAM : 0.000104 kWh. RAM Power : 94.33266592025757 W
[codecarbon INFO @ 17:18:40] Energy consumed for all GPUs : 0.000058 kWh. Total GPU Power : 53.12845586426277 W
[codecarbon INFO @ 17:18:40] Energy consumed for all CPUs : 0.000047 kWh. Total CPU Power : 42.5 W
[codecarbon INFO @ 17:18:40] 0.000209 kWh of electricity used since the beginning.
[codecarbon INFO @ 17:18:40] Done!



({}, {}, {})