In [1]:
import wandb
from wandb.keras import WandbMetricsLogger

from tqdm.keras import TqdmCallback

import tensorflow as tf
from restorers.model import NAFNet
from restorers.dataloader import LOLDataLoader
from restorers.losses import CharbonnierLoss, PSNRLoss
from restorers.metrics import PSNRMetric, SSIMMetric
from restorers.utils import get_model_checkpoint_callback

2023-03-04 23:46:36.672863: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-03-04 23:46:36.811559: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-03-04 23:46:39.573183: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda/lib64:/usr/local/nccl2/lib:/usr/local/cuda/extras/CUPTI/lib64
2023-03-0

In [2]:
# Start the Weights & Biases run for this experiment.
wandb.init(entity="ml-colabs", project="nafnet")

# Settings for the LoL data loader, gathered in one place so they are easy to
# tweak. NOTE(review): image_size is presumably the training crop size and
# val_split the held-out fraction — confirm against LOLDataLoader.
loader_kwargs = dict(
    dataset_artifact_address="ml-colabs/dataset/LoL:v0",
    image_size=128,
    bit_depth=8,
    val_split=0.2,
    visualize_on_wandb=False,
)
data_loader = LOLDataLoader(**loader_kwargs)

# Training and validation datasets, requested with a batch size of 4.
train_dataset, val_dataset = data_loader.get_datasets(batch_size=4)

[34m[1mwandb[0m: Currently logged in as: [33mgeekyrakshit[0m ([33mml-colabs[0m). Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m: Downloading large artifact LoL:v0, 331.95MB. 1003 files... 
[34m[1mwandb[0m:   1003 of 1003 files downloaded.  
Done. 0:0:0.2
2023-03-04 23:46:58.283032: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-03-04 23:46:58.964956: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1613] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 38220 MB memory:  -> device: 0, name: NVIDIA A100-SXM4-40GB, pci bus id: 0000:00:04.0, compute capability: 8.0


Instructions for updating:
Lambda fuctions will be no more assumed to be used in the statement where they are used, or at least in the same block. https://github.com/tensorflow/tensorflow/issues/56089


In [3]:
# NAFNet built with the constructor's default arguments.
model = NAFNet()

# Cosine learning-rate decay. Its horizon is (training images // 4) * 100:
# the 4 and the 100 mirror the batch size and the epoch count used by the
# other cells of this notebook, so keep them in sync if either one changes.
steps_per_epoch = len(data_loader.train_input_images) // 4
decay_steps = steps_per_epoch * 100
lr_schedule_fn = tf.keras.optimizers.schedules.CosineDecay(
    alpha=1e-6,
    decay_steps=decay_steps,
    initial_learning_rate=2e-4,
)

# AdamW driven by the schedule above.
optimizer = tf.keras.optimizers.experimental.AdamW(
    weight_decay=1e-4,
    learning_rate=lr_schedule_fn,
)

# Image-quality metrics reported during training.
# NOTE(review): max_val=1.0 presumably assumes images scaled to [0, 1] — confirm.
psnr_metric = PSNRMetric(max_val=1.0)
ssim_metric = SSIMMetric(max_val=1.0)

# Training objective.
loss = CharbonnierLoss(epsilon=1e-3)

model.compile(
    loss=loss,
    optimizer=optimizer,
    metrics=[psnr_metric, ssim_metric],
)

In [4]:
# Build the callbacks one by one, in the same order they are passed to Keras.
metrics_logger = WandbMetricsLogger(log_freq="batch")
checkpoint_callback = get_model_checkpoint_callback(
    filepath="checkpoint",
    save_best_only=False,
    using_wandb=True,
)
progress_bar = TqdmCallback()
training_callbacks = [metrics_logger, checkpoint_callback, progress_bar]

# NOTE(review): the recorded output of this cell ends with
# "TypeError: get_config() missing 1 required positional argument: 'self'"
# right after the checkpoint assets are written. Presumably it is raised while
# the checkpoint callback serializes the model — confirm and fix before
# relying on a full 100-epoch run.
model.fit(
    train_dataset,
    epochs=100,
    validation_data=val_dataset,
    callbacks=training_callbacks,
)

0epoch [00:00, ?epoch/s]

0batch [00:00, ?batch/s]

Epoch 1/100


2023-03-04 23:47:18.630207: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:428] Loaded cuDNN version 8200
2023-03-04 23:47:20.514102: I tensorflow/compiler/xla/service/service.cc:173] XLA service 0x7f118adc7c40 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2023-03-04 23:47:20.514144: I tensorflow/compiler/xla/service/service.cc:181]   StreamExecutor device (0): NVIDIA A100-SXM4-40GB, Compute Capability 8.0
2023-03-04 23:47:20.811567: I tensorflow/compiler/jit/xla_compilation_cache.cc:477] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.
2023-03-04 23:47:22.255308: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.






INFO:tensorflow:Assets written to: checkpoint/assets


INFO:tensorflow:Assets written to: checkpoint/assets


TypeError: get_config() missing 1 required positional argument: 'self'

In [None]:
# Mark the W&B run opened by wandb.init() at the top of the notebook as finished.
# This cell has no execution count in the saved notebook, i.e. it was not run.
wandb.finish()