In [1]:
import os
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.keras import layers

# Verify GPU availability
print("Available GPUs:", tf.config.list_physical_devices('GPU'))

2025-03-21 17:53:30.323494: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-03-21 17:53:30.353992: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1742559810.374242    5526 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1742559810.379609    5526 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-03-21 17:53:30.403246: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

Available GPUs: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [2]:
IMG_SIZE = 224
BATCH_SIZE = 16
EPOCHS = 100
AUTOTUNE = tf.data.AUTOTUNE
TEMPERATURE = 0.1

In [3]:
def parse_tfrecord(example_proto):
    feature_description = {
        "image": tf.io.FixedLenFeature([], tf.string)
    }
    parsed = tf.io.parse_single_example(example_proto, feature_description)
    image = tf.image.decode_jpeg(parsed["image"], channels=3)
    image = tf.image.resize(image, [IMG_SIZE, IMG_SIZE])
    image = tf.cast(image, tf.float32) / 255.0
    return image

TFRECORD_PATH = "/home/srivatsa/Multiple_Crop_Disease_Detection/Dataset/PlantVillage_Structured/AUG_TFRecord/split/train.tfrecord"
raw_dataset = tf.data.TFRecordDataset(TFRECORD_PATH)
image_dataset = raw_dataset.map(parse_tfrecord, num_parallel_calls=AUTOTUNE)

I0000 00:00:1742559813.427548    5526 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 5563 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 4060 Laptop GPU, pci bus id: 0000:01:00.0, compute capability: 8.9


In [4]:
contrastive_augment = tf.keras.Sequential([
    layers.RandomCrop(IMG_SIZE, IMG_SIZE, seed=42),
    layers.RandomFlip("horizontal", seed=42),
    layers.RandomRotation(0.1, seed=42),
    layers.RandomZoom(0.2, seed=42),
    layers.RandomContrast(0.2, seed=42)
])

def create_contrastive_pairs(ds):
    def _pair(img):
        aug1 = contrastive_augment(img)
        aug2 = contrastive_augment(img)
        return (aug1, aug2)
    return ds.map(_pair, num_parallel_calls=AUTOTUNE).shuffle(1000).batch(BATCH_SIZE).prefetch(AUTOTUNE)

train_ds = create_contrastive_pairs(image_dataset)
train_ds = train_ds.apply(tf.data.experimental.prefetch_to_device('/gpu:0'))

In [5]:
def contrastive_loss(z1, z2, temperature=TEMPERATURE):
    z1 = tf.math.l2_normalize(z1, axis=1)
    z2 = tf.math.l2_normalize(z2, axis=1)
    batch_size = tf.shape(z1)[0]

    z = tf.concat([z1, z2], axis=0)
    similarity_matrix = tf.matmul(z, z, transpose_b=True)

    mask = tf.eye(2 * batch_size)
    logits = similarity_matrix / temperature
    logits = logits - 1e9 * mask

    contrastive_labels = tf.range(batch_size, dtype=tf.int32)
    contrastive_labels = tf.concat([contrastive_labels + batch_size, contrastive_labels], axis=0)

    positives = tf.concat([
    tf.linalg.diag_part(similarity_matrix, k=batch_size),
    tf.linalg.diag_part(similarity_matrix, k=-batch_size)
    ], axis=0)

    positives = positives / temperature
    exp_logits = tf.reduce_sum(tf.exp(logits), axis=1)
    loss = -positives + tf.math.log(exp_logits)
    return tf.reduce_mean(loss)

In [6]:
# Enable mixed precision
from tensorflow.keras.mixed_precision import Policy
tf.keras.mixed_precision.set_global_policy('float32')


# Use MirroredStrategy
strategy = tf.distribute.MirroredStrategy()
print("Number of GPUs used:", strategy.num_replicas_in_sync)

with strategy.scope():
    def build_encoder():
        base_model = tf.keras.applications.EfficientNetV2B0(
            include_top=False, weights='imagenet', pooling='avg',
            input_shape=(IMG_SIZE, IMG_SIZE, 3)
        )
        return tf.keras.Model(inputs=base_model.input, outputs=base_model.output)

    def build_projection_head():
        inputs = layers.Input(shape=(1280,))
        x = layers.Dense(512, activation=None)(inputs)
        x = layers.BatchNormalization()(x)
        outputs = layers.Dense(128, dtype='float32')(x)
        return tf.keras.Model(inputs, outputs)

    encoder = build_encoder()
    proj_head = build_projection_head()
    optimizer = tf.keras.optimizers.Adam(1e-4,clipnorm=1.0)

    @tf.function
    def train_step(x1, x2):
        with tf.GradientTape() as tape:
            h1 = encoder(x1, training=True)
            h2 = encoder(x2, training=True)
            z1 = proj_head(h1, training=True)
            z2 = proj_head(h2, training=True)
            print("z1 norm:", tf.norm(z1, axis=1))
            print("z2 norm:", tf.norm(z2, axis=1))
            loss = contrastive_loss(z1, z2)

        grads = tape.gradient(loss, encoder.trainable_variables + proj_head.trainable_variables)
        optimizer.apply_gradients(zip(grads, encoder.trainable_variables + proj_head.trainable_variables))
        return loss

INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0',)
Number of GPUs used: 1


In [7]:
with strategy.scope():
    best_loss = float('inf')
    patience, wait = 5, 0

    for epoch in range(EPOCHS):
        total_loss, steps = 0.0, 0
        for x1, x2 in train_ds:
            if x1 is None or x2 is None:
                continue
            per_replica_loss = strategy.run(train_step, args=(x1, x2))
            batch_loss = strategy.reduce(tf.distribute.ReduceOp.SUM, per_replica_loss, axis=None)
            total_loss += batch_loss.numpy()
            steps += 1

        epoch_loss = total_loss / steps
        print(f"Epoch {epoch+1}/{EPOCHS} - Loss: {epoch_loss:.4f}")

        if epoch_loss < best_loss:
            best_loss = epoch_loss
            wait = 0
            encoder.save("/home/srivatsa/Multiple_Crop_Disease_Detection/Models/SimCLR_Pretrained/best_simCLR_Encoder.keras")
            print(f"✅ Best model saved with loss: {best_loss:.4f}")
        else:
            wait += 1
            if wait >= patience:
                print("✅ Early stopping triggered.")
                break

print("✅ SimCLR Training Completed. Encoder Saved.")

2025-03-21 17:53:36.945260: I tensorflow/core/kernels/data/tf_record_dataset_op.cc:370] TFRecordDataset `buffer_size` is unspecified, default to 262144


z1 norm: Tensor("norm/Squeeze:0", shape=(16,), dtype=float32, device=/job:localhost/replica:0/task:0/device:GPU:0)
z2 norm: Tensor("norm_1/Squeeze:0", shape=(16,), dtype=float32, device=/job:localhost/replica:0/task:0/device:GPU:0)
z1 norm: Tensor("norm/Squeeze:0", shape=(16,), dtype=float32, device=/job:localhost/replica:0/task:0/device:GPU:0)
z2 norm: Tensor("norm_1/Squeeze:0", shape=(16,), dtype=float32, device=/job:localhost/replica:0/task:0/device:GPU:0)


E0000 00:00:1742559854.861479    5526 meta_optimizer.cc:966] layout failed: INVALID_ARGUMENT: Size of values 0 does not match size of permutation 4 @ fanin shape infunctional_1_3/block2b_drop_1/stateless_dropout/SelectV2-2-TransposeNHWCToNCHW-LayoutOptimizer
I0000 00:00:1742559860.835360    5638 cuda_dnn.cc:529] Loaded cuDNN version 90300
2025-03-21 18:03:59.422109: I tensorflow/core/framework/local_rendezvous.cc:405] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node IteratorGetNext}}]]
2025-03-21 18:03:59.422452: I tensorflow/core/framework/local_rendezvous.cc:405] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node IteratorGetNext}}]]
	 [[RemoteCall]]


z1 norm: Tensor("norm/Squeeze:0", shape=(11,), dtype=float32, device=/job:localhost/replica:0/task:0/device:GPU:0)
z2 norm: Tensor("norm_1/Squeeze:0", shape=(11,), dtype=float32, device=/job:localhost/replica:0/task:0/device:GPU:0)


E0000 00:00:1742560455.821950    5526 meta_optimizer.cc:966] layout failed: INVALID_ARGUMENT: Size of values 0 does not match size of permutation 4 @ fanin shape infunctional_1_3/block2b_drop_1/stateless_dropout/SelectV2-2-TransposeNHWCToNCHW-LayoutOptimizer


Epoch 1/100 - Loss: 0.0582
✅ Best model saved with loss: 0.0582


2025-03-21 18:13:57.501822: I tensorflow/core/framework/local_rendezvous.cc:405] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node IteratorGetNext}}]]


Epoch 2/100 - Loss: 0.0108
✅ Best model saved with loss: 0.0108


2025-03-21 18:23:38.517681: I tensorflow/core/framework/local_rendezvous.cc:405] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node IteratorGetNext}}]]
	 [[RemoteCall]]


Epoch 3/100 - Loss: 0.0071
✅ Best model saved with loss: 0.0071
Epoch 4/100 - Loss: 0.0058
✅ Best model saved with loss: 0.0058
Epoch 5/100 - Loss: 0.0054
✅ Best model saved with loss: 0.0054


2025-03-21 18:53:23.619878: I tensorflow/core/framework/local_rendezvous.cc:405] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node IteratorGetNext}}]]


Epoch 6/100 - Loss: 0.0042
✅ Best model saved with loss: 0.0042
Epoch 7/100 - Loss: 0.0036
✅ Best model saved with loss: 0.0036
Epoch 8/100 - Loss: 0.0034
✅ Best model saved with loss: 0.0034
Epoch 9/100 - Loss: 0.0037
Epoch 10/100 - Loss: 0.0033
✅ Best model saved with loss: 0.0033


2025-03-21 19:41:46.551938: I tensorflow/core/framework/local_rendezvous.cc:405] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node IteratorGetNext}}]]
	 [[RemoteCall]]


Epoch 11/100 - Loss: 0.0031
✅ Best model saved with loss: 0.0031
Epoch 12/100 - Loss: 0.0030
✅ Best model saved with loss: 0.0030
Epoch 13/100 - Loss: 0.0030
Epoch 14/100 - Loss: 0.0030
✅ Best model saved with loss: 0.0030
Epoch 15/100 - Loss: 0.0024
✅ Best model saved with loss: 0.0024
Epoch 16/100 - Loss: 0.0030
Epoch 17/100 - Loss: 0.0025
Epoch 18/100 - Loss: 0.0021
✅ Best model saved with loss: 0.0021
Epoch 19/100 - Loss: 0.0023
Epoch 20/100 - Loss: 0.0029
Epoch 21/100 - Loss: 0.0020
✅ Best model saved with loss: 0.0020


2025-03-21 21:27:42.757200: I tensorflow/core/framework/local_rendezvous.cc:405] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node IteratorGetNext}}]]


Epoch 22/100 - Loss: 0.0020
Epoch 23/100 - Loss: 0.0025
Epoch 24/100 - Loss: 0.0020
✅ Best model saved with loss: 0.0020
Epoch 25/100 - Loss: 0.0020
Epoch 26/100 - Loss: 0.0025
Epoch 27/100 - Loss: 0.0020
Epoch 28/100 - Loss: 0.0020
Epoch 29/100 - Loss: 0.0022
✅ Early stopping triggered.
✅ SimCLR Training Completed. Encoder Saved.
