In [1]:
import os
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from sklearn.metrics import f1_score, precision_score, recall_score
import datetime
from helpers import f1 as f1_metric
from model import unet

def set_random_seeds(seed):
    """Seed the Python, NumPy and TensorFlow random number generators.

    Args:
        seed: Seed value passed unchanged to ``random.seed``,
            ``np.random.seed`` and ``tf.random.set_seed``.

    Note:
        ``PYTHONHASHSEED`` is written to ``os.environ`` as well. Python reads
        that variable at interpreter start-up, so the assignment only affects
        processes launched afterwards, not hashing in the current process.
    """
    # Function-scope import: the stdlib ``random`` module is not among the
    # file's top-level imports.
    import random

    os.environ['PYTHONHASHSEED'] = str(seed)
    # The original seeded NumPy and TensorFlow only; code relying on the
    # built-in ``random`` module stayed non-deterministic.
    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)

def create_data_generators():
    """Build the directory iterators for the training and validation splits.

    Every iterator is created with ``target_size=(256, 256)``,
    ``batch_size=8``, ``class_mode=None`` and ``seed=42``. Image iterators
    use ``rescale=1./255`` and ``color_mode='rgb'``; mask iterators use no
    rescaling and ``color_mode='grayscale'``. The two validation iterators
    additionally pass ``shuffle=False``.

    Returns:
        Tuple ``(train_images, train_masks, val_images, val_masks)`` in that
        order.
    """
    shared_options = {
        'target_size': (256, 256),
        'batch_size': 8,
        'class_mode': None,
        'seed': 42,
    }

    # (directory, is_image_stream, extra flow options), listed in the order
    # the iterators are created and returned.
    specs = (
        ('data_v3_processed/train/images', True, {}),
        ('data_v3_processed/train/masks', False, {}),
        ('data_v3_processed/val/images', True, {'shuffle': False}),
        ('data_v3_processed/val/masks', False, {'shuffle': False}),
    )

    flows = []
    for directory, is_image_stream, extra_options in specs:
        # Each stream gets its own ImageDataGenerator instance.
        if is_image_stream:
            datagen = ImageDataGenerator(rescale=1./255)
            color_mode = 'rgb'
        else:
            datagen = ImageDataGenerator()
            color_mode = 'grayscale'

        flows.append(datagen.flow_from_directory(
            directory,
            color_mode=color_mode,
            **shared_options,
            **extra_options))

    return tuple(flows)
    
def _prepare_batch(img_batch, mask_batch):
    """Turn one raw (images, masks) pair into arrays ready to be yielded.

    Args:
        img_batch: Array-like batch of images; converted to ``float32``.
        mask_batch: Array-like batch of masks, either 3-D (no channel axis)
            or 4-D with 1 or 3 channels.

    Returns:
        ``(images, masks)`` where ``masks`` is a 4-D ``float32`` array of
        0.0/1.0 values whose last axis has size 1, or ``None`` when the pair
        must be skipped (empty batch, differing batch sizes, or a mask that
        cannot be brought to that layout).
    """
    img_batch = np.asarray(img_batch).astype(np.float32)
    mask_batch = np.asarray(mask_batch)

    # Skip empty batches and pairs whose batch sizes disagree.
    batch_size = img_batch.shape[0]
    if batch_size == 0 or batch_size != mask_batch.shape[0]:
        return None

    # Masks whose maximum exceeds 1 are treated as 0-255 data and rescaled.
    if mask_batch.max() > 1:
        mask_batch = mask_batch / 255.0

    # Bring masks to a single trailing channel.
    if mask_batch.ndim == 4 and mask_batch.shape[-1] == 3:
        mask_batch = np.mean(mask_batch, axis=-1, keepdims=True)
    elif mask_batch.ndim == 3:
        mask_batch = np.expand_dims(mask_batch, axis=-1)

    # Force binary values and the dtype the loss expects.
    mask_batch = (mask_batch > 0.5).astype(np.float32)

    # Final layout check. The batch axis is untouched by the steps above,
    # so the batch-size comparison does not need repeating here.
    if mask_batch.ndim != 4 or mask_batch.shape[-1] != 1:
        return None

    return img_batch, mask_batch


def combine_generator(image_generator, mask_generator):
    """Yield ``(images, binary_masks)`` pairs from two parallel batch sources.

    The two sources are iterated in lock-step with ``zip``. Unusable pairs
    are skipped, and pairs whose preparation raises are reported with
    ``print`` and skipped. When a pass over the sources ends, they are
    iterated again.

    Args:
        image_generator: Iterable of image batches.
        mask_generator: Iterable of mask batches, in the same order.

    Yields:
        Tuples ``(images, masks)`` as produced by ``_prepare_batch``.

    Note:
        The generator stops once a complete pass yields nothing (empty or
        exhausted sources, or every pair skipped). Previously that case
        looped forever without yielding, hanging the consumer.
    """
    while True:
        yielded_any = False
        for img_batch, mask_batch in zip(image_generator, mask_generator):
            # Only batch preparation is guarded; exceptions thrown into the
            # generator at ``yield`` propagate instead of being swallowed.
            try:
                prepared = _prepare_batch(img_batch, mask_batch)
            except Exception as e:
                print(f"Generator error: {e}")
                continue

            if prepared is None:
                continue

            yielded_any = True
            yield prepared

        if not yielded_any:
            # Nothing usable in a whole pass: another pass cannot do better
            # for re-iterable inputs and yields nothing for exhausted ones.
            return

def train_and_evaluate_model(run_number, train_generator, val_generator, 
                           epochs=3, steps_per_epoch=1000, validation_steps=5, 
                           random_seed=None, total_runs=5):
    """Train one U-Net and save it to ``models/unet_run<run_number + 1>.h5``.

    Despite the name, no separate evaluation step is run and no metrics are
    returned; validation happens only through ``validation_data`` in ``fit``.

    Args:
        run_number: Zero-based run index, used in the banner and file name.
        train_generator: Training data passed to ``model.fit``.
        val_generator: Validation data passed to ``model.fit``.
        epochs: Number of epochs.
        steps_per_epoch: Training batches drawn per epoch.
        validation_steps: Validation batches drawn per epoch.
        random_seed: When not ``None``, passed to ``set_random_seeds`` before
            the model is built.
        total_runs: Total number of runs shown in the banner (display only).

    Returns:
        Path of the saved model file, or ``None`` if training or saving
        raised an exception (the error is printed).
    """
    print(f"\nTraining Run {run_number + 1}/{total_runs} (Seed: {random_seed})")
    print("=" * 50)
    
    if random_seed is not None:
        set_random_seeds(random_seed)
    
    model = unet(256, 256, 3)
    
    model.compile(optimizer='adam', 
                 loss='binary_crossentropy', 
                 metrics=[f1_metric])
    
    try:
        # The History object returned by fit is not used, so it is not kept.
        model.fit(train_generator,
                  validation_data=val_generator,
                  epochs=epochs,
                  steps_per_epoch=steps_per_epoch,
                  validation_steps=validation_steps,
                  verbose=1)

        os.makedirs("models", exist_ok=True)
        model_path = os.path.join("models", f"unet_run{run_number+1}.h5")
        model.save(model_path)
        
        return model_path
        
    except Exception as e:
        # Best-effort: report the failure and let the caller continue with
        # the next run.
        print(f"ERROR in training: {e}")
        return None

# Main execution: one training run per seed
random_seeds = [42, 123, 456, 789, 999]

for run, run_seed in enumerate(random_seeds):
    # Rebuild the directory iterators and the combined generators on every
    # run so that no iterator state carries over from the previous run.
    (train_image_gen, train_mask_gen,
     val_image_gen, val_mask_gen) = create_data_generators()

    train_generator = combine_generator(train_image_gen, train_mask_gen)
    val_generator = combine_generator(val_image_gen, val_mask_gen)

    model_path = train_and_evaluate_model(
        run,
        train_generator,
        val_generator,
        random_seed=run_seed,
    )

    # Reset Keras' global state before the next model is built.
    tf.keras.backend.clear_session()

2025-08-21 13:50:56.326475: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-08-21 13:50:56.366898: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-08-21 13:50:56.366957: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-08-21 13:50:56.368438: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-08-21 13:50:56.376493: I tensorflow/core/platform/cpu_feature_guar

Found 3285 images belonging to 2 classes.
Found 3285 images belonging to 2 classes.
Found 109 images belonging to 1 classes.
Found 108 images belonging to 1 classes.

Training Run 1/5 (Seed: 42)


2025-08-21 13:50:58.873476: W tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc:47] Overriding orig_value setting because the TF_FORCE_GPU_ALLOW_GROWTH environment variable is set. Original config value was 0.
2025-08-21 13:50:58.874006: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1929] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 32985 MB memory:  -> device: 0, name: NVIDIA RTX 6000 Ada Generation, pci bus id: 0000:81:00.0, compute capability: 8.9


Epoch 1/3


2025-08-21 13:51:03.296730: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:961] layout failed: INVALID_ARGUMENT: Size of values 0 does not match size of permutation 4 @ fanin shape inmodel/dropout/dropout/SelectV2-2-TransposeNHWCToNCHW-LayoutOptimizer
2025-08-21 13:51:03.926178: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:454] Loaded cuDNN version 8906
2025-08-21 13:51:05.793674: I external/local_xla/xla/service/service.cc:168] XLA service 0x7f7a74c9fb10 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2025-08-21 13:51:05.793718: I external/local_xla/xla/service/service.cc:176]   StreamExecutor device (0): NVIDIA RTX 6000 Ada Generation, Compute Capability 8.9
2025-08-21 13:51:05.801880: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
I0000 00:00:1755784265.936787  543056 device_compiler.h:186] Compiled cluster using 

Epoch 2/3
Epoch 3/3


  saving_api.save_model(


Found 3285 images belonging to 2 classes.
Found 3285 images belonging to 2 classes.
Found 109 images belonging to 1 classes.
Found 108 images belonging to 1 classes.

Training Run 2/5 (Seed: 123)
Epoch 1/3


2025-08-21 13:52:27.278552: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:961] layout failed: INVALID_ARGUMENT: Size of values 0 does not match size of permutation 4 @ fanin shape inmodel/dropout/dropout/SelectV2-2-TransposeNHWCToNCHW-LayoutOptimizer


Epoch 2/3
Epoch 3/3
Found 3285 images belonging to 2 classes.
Found 3285 images belonging to 2 classes.
Found 109 images belonging to 1 classes.
Found 108 images belonging to 1 classes.

Training Run 3/5 (Seed: 456)
Epoch 1/3


2025-08-21 13:53:43.939826: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:961] layout failed: INVALID_ARGUMENT: Size of values 0 does not match size of permutation 4 @ fanin shape inmodel/dropout/dropout/SelectV2-2-TransposeNHWCToNCHW-LayoutOptimizer


Epoch 2/3
Epoch 3/3
Found 3285 images belonging to 2 classes.
Found 3285 images belonging to 2 classes.
Found 109 images belonging to 1 classes.
Found 108 images belonging to 1 classes.

Training Run 4/5 (Seed: 789)
Epoch 1/3


2025-08-21 13:54:59.799787: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:961] layout failed: INVALID_ARGUMENT: Size of values 0 does not match size of permutation 4 @ fanin shape inmodel/dropout/dropout/SelectV2-2-TransposeNHWCToNCHW-LayoutOptimizer


Epoch 2/3
Epoch 3/3
Found 3285 images belonging to 2 classes.
Found 3285 images belonging to 2 classes.
Found 109 images belonging to 1 classes.
Found 108 images belonging to 1 classes.

Training Run 5/5 (Seed: 999)
Epoch 1/3


2025-08-21 13:56:20.189025: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:961] layout failed: INVALID_ARGUMENT: Size of values 0 does not match size of permutation 4 @ fanin shape inmodel/dropout/dropout/SelectV2-2-TransposeNHWCToNCHW-LayoutOptimizer


Epoch 2/3
Epoch 3/3
