In [1]:
import tensorflow as tf
from tensorflow.keras import callbacks
import os
from utils.utils_functions import *
import json
import numpy as np
from models.registry import MODELS_REGISTRY
from utils.datasets import build_dataset
import utils.config as config

2026-01-27 13:10:20.943198: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2026-01-27 13:10:20.943252: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2026-01-27 13:10:20.943271: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2026-01-27 13:10:20.948327: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [None]:
# Reset Keras' global state and fix the random seed before anything is built.
tf.keras.backend.clear_session()
tf.keras.utils.set_random_seed(42)

# Loss functions to compare, keyed by the short name used in log/artifact paths.
# NOTE(review): `weighted_cce` and `dice_loss` are not defined in this notebook;
# presumably they come from the wildcard import of `utils.utils_functions` — confirm.
loss_functions = dict(
    sparse_cce=tf.keras.losses.SparseCategoricalCrossentropy(),
    weighted_cce=weighted_cce,
    dice_loss=dice_loss,
)

def convert_to_serializable(obj):
    """Recursively convert NumPy / tensor values into JSON-friendly Python types.

    Args:
        obj: Any value. Dicts, lists and tuples are walked recursively (dict
            keys are left untouched); NumPy scalars, NumPy arrays and objects
            exposing a ``numpy()`` method (e.g. eager tensors) are converted.

    Returns:
        The same structure with:
          * NumPy scalars replaced by the matching Python scalar
            (``np.integer`` -> ``int``, ``np.floating`` -> ``float``,
            ``np.bool_`` -> ``bool``),
          * NumPy arrays replaced by nested lists,
          * tensor-like objects replaced by the conversion of ``obj.numpy()``,
            so both scalar and non-scalar tensors are supported,
          * tuples rebuilt as plain tuples with converted items.
        Anything else is returned unchanged.
    """
    if isinstance(obj, dict):
        return {key: convert_to_serializable(value) for key, value in obj.items()}
    if isinstance(obj, list):
        return [convert_to_serializable(item) for item in obj]
    if isinstance(obj, tuple):
        # json.dump serializes tuples, but only if their items are native types.
        return tuple(convert_to_serializable(item) for item in obj)
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    if isinstance(obj, np.generic):
        # .item() yields the closest native scalar, keeping ints as ints and
        # bools as bools instead of coercing everything to float.
        return obj.item()
    if hasattr(obj, 'numpy'):
        # Route through the ndarray / scalar branches above so non-scalar
        # tensors become lists instead of raising inside float().
        return convert_to_serializable(obj.numpy())
    return obj

# ---------------------------------------------------------------------------
# Loss-function comparison: train the same registry model once per entry in
# `loss_functions`, saving the best checkpoint, the Keras history and the
# dynamics-logger export for each run.
# ---------------------------------------------------------------------------

# Settings shared by every run (hoisted out of the loop: they never change).
test_model_name = 'model1'
experiment_seed = 42
num_classes = 3
train_capped_size = 200
val_capped_size = 50
max_epochs = 10
output_dir = 'saved_models/testing_loss'

os.makedirs('logs', exist_ok=True)
# Create the directory the artifacts are actually written to. Creating only
# 'saved_models' left the history JSON dependent on ModelCheckpoint having
# created the sub-directory as a side effect.
os.makedirs(output_dir, exist_ok=True)

for loss_name, loss_function in loss_functions.items():
    tf.keras.backend.clear_session()
    # Re-seed per run so every loss function starts from the same RNG state;
    # otherwise differences between runs mix the loss effect with different
    # initial weights / shuffling.
    tf.keras.utils.set_random_seed(experiment_seed)

    print(f"\n\n{'='*70}")
    print(f"Training {test_model_name} with loss: {loss_name}")
    print(f"{'='*70}")

    log_dir = f'logs/{test_model_name}_{loss_name}'
    os.makedirs(log_dir, exist_ok=True)

    # Per-run artifact paths.
    checkpoint_path = f"{output_dir}/{test_model_name}_{loss_name}_best.h5"
    history_path = f"{output_dir}/{test_model_name}_{loss_name}_history.json"
    dyn_path = f"{output_dir}/{test_model_name}_{loss_name}_dynamics.json"

    model = MODELS_REGISTRY[test_model_name]()

    model.compile(
        optimizer=tf.keras.optimizers.Adam(config.LR),
        loss=loss_function,
        metrics=[
            MeanIoUMetric(num_classes=num_classes),
            dice_coeff_metric()
        ]
    )

    print("\nBuilding datasets...")
    train_ds = build_dataset(
        config.TRAIN_X,
        config.TRAIN_Y,
        batch_size=config.BATCH_SIZE,
        shuffle=True,
        augment=True,
        capped_size=train_capped_size
    )

    val_ds = build_dataset(
        config.VAL_X,
        config.VAL_Y,
        batch_size=config.BATCH_SIZE,
        shuffle=False,
        capped_size=val_capped_size
    )

    print(f"Training dataset batch size: {config.BATCH_SIZE}")
    print(f"Validation dataset batch size: {config.BATCH_SIZE}")

    print("\nInitializing optimized dynamics logger...")
    dynamics_logger = OptimizedDynamicsLogger(
        val_dataset=val_ds,
        num_classes=num_classes,
        log_dir=log_dir,
        max_samples=50,
        batch_log_freq=100
    )
    # NOTE(review): the "val_mean_iou" monitors assume MeanIoUMetric reports
    # itself under the name "mean_iou" — confirm in utils.
    cbs = [
        callbacks.ModelCheckpoint(
            filepath=checkpoint_path,
            monitor="val_mean_iou",
            mode="max",
            save_best_only=True,
            verbose=1
        ),
        callbacks.EarlyStopping(
            monitor="val_mean_iou",
            mode="max",
            patience=6,
            restore_best_weights=True,
            verbose=1
        ),
        # Monitors the *training* loss (not a validation quantity).
        callbacks.ReduceLROnPlateau(
            monitor='loss',
            factor=0.5,
            patience=3,
            min_lr=1e-6,
            verbose=1
        ),
        callbacks.TensorBoard(
            log_dir=log_dir,
            histogram_freq=1,
            write_graph=True,
            update_freq=50
        ),
        dynamics_logger
    ]

    try:
        history = model.fit(
            train_ds,
            validation_data=val_ds,
            epochs=max_epochs,
            callbacks=cbs,
            verbose=1
        )

        print("\n" + "="*70)
        print("Training completed successfully!")
        print("="*70)

        print(f"\nSaving training history to {history_path}...")
        with open(history_path, "w") as f:
            serializable_history = convert_to_serializable(history.history)
            json.dump(serializable_history, f, indent=2)

        print(f"Exporting dynamics to {dyn_path}...")
        dynamics_logger.export_to_json(dyn_path)

        print("\n" + "-"*70)
        print("TRAINING SUMMARY")
        print("-"*70)
        print(f"Model: {test_model_name}")
        print(f"Loss function: {loss_name}")
        print(f"Best val_loss: {min(history.history['val_loss']):.4f}")
        # Guard on the key that is actually read; the old check tested
        # 'mean_iou' and could fall back to printing a misleading 0.0000.
        if 'val_mean_iou' in history.history:
            print(f"Best val_mean_iou: {max(history.history['val_mean_iou']):.4f}")
        print("\nFiles saved:")
        print(f"  - Model: {checkpoint_path}")
        print(f"  - History: {history_path}")
        print(f"  - Dynamics: {dyn_path}")
        print(f"  - Logs: {log_dir}")
        print("\nView results in TensorBoard:")
        print("  tensorboard --logdir=logs/")
        print("-"*70)

    except Exception as e:
        # Best effort: report the failure and move on to the next loss
        # function (the loop body ends right after the `finally` clause).
        print(f"\n{'!'*70}")
        print(f"ERROR during training with {loss_name}")
        print(f"{'!'*70}")
        print(f"Error message: {str(e)}")

    finally:
        # Runs on success and on failure, so a failed run does not keep its
        # model alive while the next one is built.
        print("\nCleaning up memory...")
        del model
        tf.keras.backend.clear_session()

print("\n" + "="*70)
print("ALL EXPERIMENTS COMPLETED")
print("="*70)
print("\nTo view all results in TensorBoard:")
# Logs are written to logs/<model>_<loss>; 'logs/loss/' is never created here.
print("  tensorboard --logdir=logs/")



Training model1 with loss: sparse_cce


2026-01-27 13:10:22.712481: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:880] could not open file to read NUMA node: /sys/bus/pci/devices/0000:09:00.0/numa_node
Your kernel may have been built without NUMA support.
2026-01-27 13:10:22.741315: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:880] could not open file to read NUMA node: /sys/bus/pci/devices/0000:09:00.0/numa_node
Your kernel may have been built without NUMA support.
2026-01-27 13:10:22.741385: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:880] could not open file to read NUMA node: /sys/bus/pci/devices/0000:09:00.0/numa_node
Your kernel may have been built without NUMA support.
2026-01-27 13:10:22.743820: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:880] could not open file to read NUMA node: /sys/bus/pci/devices/0000:09:00.0/numa_node
Your kernel may have been built without NUMA support.
2026-01-27 13:10:22.743887: I tensorflow/compile


Building datasets...
Training dataset batch size: 1
Validation dataset batch size: 1

Initializing optimized dynamics logger...
Epoch 1/10


2026-01-27 13:10:26.913432: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:442] Loaded cuDNN version 8907
2026-01-27 13:10:29.164444: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x7f38f77f4a90 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2026-01-27 13:10:29.164492: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): NVIDIA GeForce GTX 1660, Compute Capability 7.5
2026-01-27 13:10:29.169238: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2026-01-27 13:10:29.248047: I ./tensorflow/compiler/jit/device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.
2026-01-27 13:10:33.653197: W tensorflow/tsl/framework/bfc_allocator.cc:296] Allocator (GPU_0_bfc) ran out of memory trying to allocate 4.28GiB with freed_by_count=0. The call

Epoch 1: val_mean_iou improved from -inf to 0.30931, saving model to loss_experiments/model1_sparse_cce_best.h5


  saving_api.save_model(
2026-01-27 13:11:24.013418: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 1132462080 exceeds 10% of free system memory.
2026-01-27 13:11:24.361229: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 2264924160 exceeds 10% of free system memory.
2026-01-27 13:11:24.950583: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 1698693120 exceeds 10% of free system memory.
2026-01-27 13:11:26.919709: W tensorflow/tsl/framework/bfc_allocator.cc:296] Allocator (GPU_0_bfc) ran out of memory trying to allocate 4.28GiB with freed_by_count=0. The caller indicates that this is not a failure, but this may mean that there could be performance gains if more memory were available.



  LR: 1.00e-04 | Class IoU Var: 0.1726 | Mean IoU: 0.2938
Epoch 2/10
Epoch 2: val_mean_iou did not improve from 0.30931


2026-01-27 13:12:14.945836: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 1132462080 exceeds 10% of free system memory.
2026-01-27 13:12:15.308210: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 2264924160 exceeds 10% of free system memory.



  LR: 1.00e-04 | Class IoU Var: 0.1726 | Mean IoU: 0.2938
Epoch 3/10
Epoch 3: val_mean_iou improved from 0.30931 to 0.39235, saving model to loss_experiments/model1_sparse_cce_best.h5

  LR: 1.00e-04 | Class IoU Var: 0.1314 | Mean IoU: 0.4166
Epoch 4/10
Epoch 4: val_mean_iou did not improve from 0.39235

  LR: 1.00e-04 | Class IoU Var: 0.1518 | Mean IoU: 0.3517
Epoch 5/10
Epoch 5: val_mean_iou improved from 0.39235 to 0.39356, saving model to loss_experiments/model1_sparse_cce_best.h5

  LR: 1.00e-04 | Class IoU Var: 0.1326 | Mean IoU: 0.4188
Epoch 6/10
Epoch 6: val_mean_iou did not improve from 0.39356

  LR: 1.00e-04 | Class IoU Var: 0.1419 | Mean IoU: 0.3880
Epoch 7/10
Epoch 7: val_mean_iou did not improve from 0.39356

  LR: 1.00e-04 | Class IoU Var: 0.1121 | Mean IoU: 0.3742
Epoch 8/10
Epoch 8: val_mean_iou improved from 0.39356 to 0.40423, saving model to loss_experiments/model1_sparse_cce_best.h5

  LR: 1.00e-04 | Class IoU Var: 0.1344 | Mean IoU: 0.4345
Epoch 9/10
Epoch 9: val