In [1]:
from utils.utils_functions import *
from utils.datasets import *
import utils.config as config
import tensorflow as tf
import os
from models.registry import *
from keras.callbacks import TensorBoard
from datetime import datetime
import json
import numpy as np
from tensorflow.keras import callbacks


2026-01-28 01:03:44.103088: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2026-01-28 01:03:44.103168: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2026-01-28 01:03:44.103196: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2026-01-28 01:03:44.113271: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Reset Keras' global state first so nothing built earlier in this kernel
# carries over into the run below.
tf.keras.backend.clear_session()
# Fix the random seed (42) so repeated runs of this cell are comparable.
# NOTE(review): per the Keras docs this seeds Python, NumPy and TensorFlow;
# confirm that build_dataset's shuffle/augment pipeline also honours it.
tf.keras.utils.set_random_seed(42)


def convert_to_serializable(obj):
    """Recursively convert ``obj`` into built-in types accepted by ``json.dump``.

    Handled inputs:
      * ``dict``            -> ``dict`` with converted values (NumPy-scalar keys
                               are unwrapped to their Python equivalents)
      * ``list`` / ``tuple`` -> ``list`` with converted items
      * ``np.bool_``        -> ``bool``
      * ``np.integer``      -> ``int`` (kept integral; no float round-trip)
      * ``np.floating``     -> ``float``
      * ``np.ndarray``      -> nested ``list`` via ``tolist()``
      * any object exposing ``.numpy()`` (e.g. an eager tensor) -> the converted
        result of ``obj.numpy()``, so both scalar and non-scalar values work

    Anything else is returned unchanged.

    Args:
        obj: Arbitrary (possibly nested) value, typically ``History.history``.

    Returns:
        A structure made only of built-in containers/scalars wherever one of
        the cases above applied.
    """
    if isinstance(obj, dict):
        return {
            # json.dump rejects NumPy scalars as keys, so unwrap them too.
            (key.item() if isinstance(key, np.generic) else key):
                convert_to_serializable(value)
            for key, value in obj.items()
        }
    if isinstance(obj, (list, tuple)):
        # Tuples must be walked as well: json.dump accepts a tuple but not the
        # NumPy scalars that may be nested inside it.
        return [convert_to_serializable(item) for item in obj]
    if isinstance(obj, np.bool_):
        return bool(obj)
    if isinstance(obj, np.integer):
        return int(obj)
    if isinstance(obj, np.floating):
        return float(obj)
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    if hasattr(obj, 'numpy'):
        # Delegate to the branches above instead of forcing float(), which
        # raises for any tensor holding more than one element.
        return convert_to_serializable(obj.numpy())
    return obj

# ---------------------------------------------------------------------------
# Final-model training run
#
# For every selected entry of MODELS_REGISTRY this script:
#   1. builds and compiles the model,
#   2. builds the training / validation datasets,
#   3. trains with checkpointing, early stopping, LR scheduling, TensorBoard
#      and the custom dynamics logger,
#   4. writes the Keras history and the logger's dynamics to JSON,
#   5. prints a short summary and releases the model.
# A failure inside training/export is reported and the loop moves on to the
# next model; the outcome of every model is listed at the end.
# ---------------------------------------------------------------------------

# Tunables for this run, gathered in one place instead of inline literals.
MODELS_TO_TRAIN = ('model3',)            # registry keys to train; others are skipped
NUM_SEG_CLASSES = 3                      # passed to the IoU metric and dynamics logger
MAX_EPOCHS = 50                          # upper bound; early stopping may end sooner
LOG_ROOT = 'logs/final_model'            # TensorBoard / dynamics log root
SAVE_ROOT = 'saved_models/final_model'   # checkpoints and JSON exports

os.makedirs(LOG_ROOT, exist_ok=True)
os.makedirs(SAVE_ROOT, exist_ok=True)

trained_models = []   # names whose training + export finished without error
failed_models = {}    # name -> "ExceptionType: message" for reporting at the end

for model_name in MODELS_REGISTRY.keys():
    if model_name not in MODELS_TO_TRAIN:
        continue
    # Drop graph/state left over from a previous iteration before building anew.
    tf.keras.backend.clear_session()
    print(f"\n\n{'='*70}")
    print(f"Training {model_name}")
    print(f"{'='*70}")

    # Per-experiment output locations.
    log_dir = f'{LOG_ROOT}/{model_name}_experiment'
    os.makedirs(log_dir, exist_ok=True)
    # NOTE(review): ".h5" is Keras' legacy HDF5 format (the run output shows a
    # saving_api warning); kept as-is so existing consumers of this path work.
    best_model_path = f"{SAVE_ROOT}/{model_name}_best.h5"

    model = MODELS_REGISTRY[model_name]()

    model.compile(
        optimizer=tf.keras.optimizers.Adam(config.LR),
        loss=tf.keras.losses.SparseCategoricalCrossentropy(),
        metrics=[
            MeanIoUMetric(num_classes=NUM_SEG_CLASSES),
            dice_coeff_metric()
        ]
    )

    # Build datasets: shuffling and augmentation are enabled for training only.
    print("\nBuilding datasets...")
    train_ds = build_dataset(
        config.TRAIN_X,
        config.TRAIN_Y,
        batch_size=config.BATCH_SIZE,
        shuffle=True,
        augment=True
    )

    val_ds = build_dataset(
        config.VAL_X,
        config.VAL_Y,
        batch_size=config.BATCH_SIZE,
        shuffle=False
    )

    print(f"Training dataset batch size: {config.BATCH_SIZE}")
    print(f"Validation dataset batch size: {config.BATCH_SIZE}")

    # Custom callback that records training dynamics on the validation set.
    # max_samples / batch_log_freq are passed through unchanged.
    # NOTE(review): presumably they bound how much is evaluated/logged to keep
    # memory in check; confirm against OptimizedDynamicsLogger.
    print(f"\nInitializing optimized dynamics logger...")
    dynamics_logger = OptimizedDynamicsLogger(
        val_dataset=val_ds,
        num_classes=NUM_SEG_CLASSES,
        log_dir=log_dir,
        max_samples=20,
        batch_log_freq=3000
    )
    cbs = [
        # Keep only the weights with the best validation mean IoU (higher is better).
        callbacks.ModelCheckpoint(
            filepath=best_model_path,
            monitor="val_mean_iou",
            mode="max",
            save_best_only=True,
            verbose=1
        ),
        # Stop after 6 epochs without a val_mean_iou improvement and roll the
        # model back to its best weights.
        callbacks.EarlyStopping(
            monitor="val_mean_iou",
            mode="max",
            patience=6,
            restore_best_weights=True,
            verbose=1
        ),
        # Halve the learning rate after 3 epochs without a val_loss improvement.
        # This watches val_loss while the two callbacks above watch val_mean_iou.
        callbacks.ReduceLROnPlateau(
            monitor='val_loss',
            factor=0.5,
            patience=3,
            min_lr=1e-6,
            verbose=1
        ),
        callbacks.TensorBoard(
            log_dir=log_dir,
            histogram_freq=1,
            write_graph=True,
            update_freq=1000
        ),
        dynamics_logger
    ]

    try:
        history = model.fit(
            train_ds,
            validation_data=val_ds,
            epochs=MAX_EPOCHS,
            callbacks=cbs,
            verbose=1
        )

        print("\n" + "="*70)
        print("Training completed successfully!")
        print("="*70)

        history_path = f"{SAVE_ROOT}/{model_name}_history.json"
        print(f"\nSaving training history to {history_path}...")
        with open(history_path, "w") as f:
            serializable_history = convert_to_serializable(history.history)
            json.dump(serializable_history, f, indent=2)

        dyn_path = f"{SAVE_ROOT}/{model_name}_dynamics.json"
        print(f"Exporting dynamics to {dyn_path}...")
        dynamics_logger.export_to_json(dyn_path)

        print("\n" + "-"*70)
        print("TRAINING SUMMARY")
        print("-"*70)
        print(f"Model: {model_name}")
        print(f"Best val_loss: {min(history.history['val_loss']):.4f}")
        # Guard on the key that is actually printed; the previous check looked
        # for the training-set key 'mean_iou' instead.
        if 'val_mean_iou' in history.history:
            print(f"Best val_mean_iou: {max(history.history['val_mean_iou']):.4f}")
        print(f"\nFiles saved:")
        print(f"  - Model: {best_model_path}")
        print(f"  - History: {history_path}")
        print(f"  - Dynamics: {dyn_path}")
        print(f"  - Logs: {log_dir}")
        print("-"*70)

        trained_models.append(model_name)

    except Exception as e:
        # Best-effort: report the failure and move on to the next model, but
        # keep the exception type so the cause is identifiable afterwards.
        failed_models[model_name] = f"{type(e).__name__}: {e}"
        print(f"\n{'!'*70}")
        print(f"ERROR during training with {model_name}!")
        print(f"{'!'*70}")
        print(f"Error message: {type(e).__name__}: {e}")
        continue

    finally:
        # Runs on success and on failure (including the `continue` above).
        print("\nCleaning up memory...")
        del model
        tf.keras.backend.clear_session()

if not trained_models and not failed_models:
    # The filter matched nothing; say so instead of silently reporting completion.
    print(f"\nWARNING: none of {list(MODELS_TO_TRAIN)} found in MODELS_REGISTRY; nothing was trained.")

print("\n" + "="*70)
print("ALL EXPERIMENTS COMPLETED")
print("="*70)
if failed_models:
    print("\nModels that FAILED:")
    for failed_name, reason in failed_models.items():
        print(f"  - {failed_name}: {reason}")
print("\nTo view all results in TensorBoard:")
print(f"  tensorboard --logdir={LOG_ROOT}/")



Training model3


2026-01-28 01:03:49.715672: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:880] could not open file to read NUMA node: /sys/bus/pci/devices/0000:09:00.0/numa_node
Your kernel may have been built without NUMA support.
2026-01-28 01:03:49.755263: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:880] could not open file to read NUMA node: /sys/bus/pci/devices/0000:09:00.0/numa_node
Your kernel may have been built without NUMA support.
2026-01-28 01:03:49.755320: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:880] could not open file to read NUMA node: /sys/bus/pci/devices/0000:09:00.0/numa_node
Your kernel may have been built without NUMA support.
2026-01-28 01:03:49.758582: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:880] could not open file to read NUMA node: /sys/bus/pci/devices/0000:09:00.0/numa_node
Your kernel may have been built without NUMA support.
2026-01-28 01:03:49.758638: I tensorflow/compile


Building datasets...
Training dataset batch size: 2
Validation dataset batch size: 2

Initializing optimized dynamics logger...
Epoch 1/50


2026-01-28 01:03:58.191917: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:442] Loaded cuDNN version 8907
2026-01-28 01:03:59.019774: W tensorflow/tsl/framework/bfc_allocator.cc:296] Allocator (GPU_0_bfc) ran out of memory trying to allocate 4.21GiB with freed_by_count=0. The caller indicates that this is not a failure, but this may mean that there could be performance gains if more memory were available.
2026-01-28 01:04:00.534765: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x7f0c2575c0a0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2026-01-28 01:04:00.534808: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): NVIDIA GeForce GTX 1660, Compute Capability 7.5
2026-01-28 01:04:00.543774: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2026-01-28 01:04:00.651568: I ./tensorflow/com



2026-01-28 01:09:43.957539: W tensorflow/tsl/framework/bfc_allocator.cc:296] Allocator (GPU_0_bfc) ran out of memory trying to allocate 4.14GiB with freed_by_count=0. The caller indicates that this is not a failure, but this may mean that there could be performance gains if more memory were available.




2026-01-28 01:09:46.478278: W tensorflow/tsl/framework/bfc_allocator.cc:296] Allocator (GPU_0_bfc) ran out of memory trying to allocate 4.21GiB with freed_by_count=0. The caller indicates that this is not a failure, but this may mean that there could be performance gains if more memory were available.
2026-01-28 01:09:46.478347: W tensorflow/tsl/framework/bfc_allocator.cc:296] Allocator (GPU_0_bfc) ran out of memory trying to allocate 4.21GiB with freed_by_count=0. The caller indicates that this is not a failure, but this may mean that there could be performance gains if more memory were available.
2026-01-28 01:09:46.490071: W tensorflow/tsl/framework/bfc_allocator.cc:296] Allocator (GPU_0_bfc) ran out of memory trying to allocate 4.09GiB with freed_by_count=0. The caller indicates that this is not a failure, but this may mean that there could be performance gains if more memory were available.



Epoch 1: val_mean_iou improved from -inf to 0.41378, saving model to saved_models/final_model/model3_best.h5


  saving_api.save_model(



  LR: 1.00e-04 | Class IoU Var: 0.1424 | Mean IoU: 0.4285
Epoch 2/50
Epoch 2: val_mean_iou improved from 0.41378 to 0.55673, saving model to saved_models/final_model/model3_best.h5

  LR: 1.00e-04 | Class IoU Var: 0.1304 | Mean IoU: 0.5123
Epoch 3/50
Epoch 3: val_mean_iou improved from 0.55673 to 0.60789, saving model to saved_models/final_model/model3_best.h5

  LR: 1.00e-04 | Class IoU Var: 0.1326 | Mean IoU: 0.5760
Epoch 4/50
Epoch 4: val_mean_iou improved from 0.60789 to 0.64469, saving model to saved_models/final_model/model3_best.h5

  LR: 1.00e-04 | Class IoU Var: 0.1115 | Mean IoU: 0.5959
Epoch 5/50
Epoch 5: val_mean_iou did not improve from 0.64469

  LR: 1.00e-04 | Class IoU Var: 0.1096 | Mean IoU: 0.6057
Epoch 6/50
Epoch 6: val_mean_iou improved from 0.64469 to 0.68055, saving model to saved_models/final_model/model3_best.h5

  LR: 1.00e-04 | Class IoU Var: 0.1004 | Mean IoU: 0.6259
Epoch 7/50
Epoch 7: val_mean_iou improved from 0.68055 to 0.70276, saving model to saved_mod