Libraries Used

In [1]:
import json as js
import tensorflow as tf
from os import listdir as ld
from os import path
from keras.models import Model #type: ignore
from keras import layers,models #type: ignore
from tqdm import tqdm
from keras.utils import to_categorical #type: ignore
from keras.losses import Huber #type: ignore

2025-04-20 04:05:51.575287: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-04-20 04:05:51.589266: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-04-20 04:05:51.593021: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-04-20 04:05:51.604272: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Loading Paths

In [2]:
with open("config_CNN.json",'r') as file:
    paths = js.load(file)

Creating Datasets

In [3]:
train_img_dir = paths['Train_resized']
train_annotation = paths['Preprocessed_Train_json']
val_img_dir = paths['Validation_resized']
val_annotation = paths['Preprocessed_Validation_json']
NUM_CLASSES = 80

with open(train_annotation, 'r') as f:
    train_annotations = js.load(f)
    
with open(val_annotation, 'r') as f:
    val_annotations = js.load(f)

train_data = []
val_data = []

print("Processing Training Data")
for info in tqdm(train_annotations):
    bbox = info['bbox']
    label = info['category_id']
    image_path = path.join(train_img_dir, info["img_id"])
    train_data.append((image_path, bbox, label))

print("Processing Validation Data")
for info in tqdm(val_annotations):
    bbox = info['bbox']
    label = info['category_id']
    image_path = path.join(val_img_dir, info["img_id"])
    val_data.append((image_path, bbox, label))

def preprocess_example(image_path, bbox, label):
    image = tf.io.read_file(image_path)
    image = tf.image.decode_jpeg(image, channels=3)
    image = tf.cast(image, tf.float32) / 255.0
    bbox_tensor = tf.stack(bbox)
    class_tensor = tf.one_hot(label, depth=NUM_CLASSES)
    return image, {
        "class_output": class_tensor,
        "bbox_output": bbox_tensor
    }

def create_dataset(data, batch_size=4, shuffle=True):
    paths, bboxes, labels = zip(*data)
    dataset = tf.data.Dataset.from_tensor_slices((list(paths), list(bboxes), list(labels)))

    if shuffle:
        dataset = dataset.shuffle(buffer_size=len(data))

    dataset = dataset.map(lambda p, b, l: preprocess_example(p, b, l), num_parallel_calls=tf.data.AUTOTUNE)
    dataset = dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)
    return dataset

train_dataset = create_dataset(train_data)
val_dataset = create_dataset(val_data, shuffle=False)


Processing Training Data


100%|██████████| 117266/117266 [00:00<00:00, 550816.35it/s]


Processing Validation Data


100%|██████████| 4952/4952 [00:00<00:00, 424913.43it/s]
I0000 00:00:1745102155.283556  488185 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
I0000 00:00:1745102155.330742  488185 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
I0000 00:00:1745102155.333098  488185 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
I0000 00:00:1745102155.336160  488185 cuda_e

Defining model

In [4]:
def compute_iou_tf(box1, box2):
    box1_x1 = box1[:, 0]
    box1_y1 = box1[:, 1]
    box1_x2 = box1[:, 0] + box1[:, 2]
    box1_y2 = box1[:, 1] + box1[:, 3]
    box2_x1 = box2[:, 0]
    box2_y1 = box2[:, 1]
    box2_x2 = box2[:, 0] + box2[:, 2]
    box2_y2 = box2[:, 1] + box2[:, 3]
    x1 = tf.maximum(box1_x1, box2_x1)
    y1 = tf.maximum(box1_y1, box2_y1)
    x2 = tf.minimum(box1_x2, box2_x2)
    y2 = tf.minimum(box1_y2, box2_y2)
    intersection = tf.maximum(0.0, x2 - x1) * tf.maximum(0.0, y2 - y1)
    area1 = box1[:, 2] * box1[:, 3]
    area2 = box2[:, 2] * box2[:, 3]
    union = area1 + area2 - intersection
    return tf.math.divide_no_nan(intersection, union)

In [5]:
class IoUMetric(tf.keras.metrics.Metric):
    def __init__(self, name='iou_metric', **kwargs):
        super(IoUMetric, self).__init__(name=name, **kwargs)
        self.total_iou = self.add_weight(name='total_iou', initializer='zeros')
        self.count = self.add_weight(name='count', initializer='zeros')

    def update_state(self, y_true, y_pred, sample_weight=None):
        y_true = tf.reshape(y_true, (-1, 4))
        y_pred = tf.reshape(y_pred, (-1, 4))
        ious = compute_iou_tf(y_true, y_pred)
        self.total_iou.assign_add(tf.reduce_sum(ious))
        self.count.assign_add(tf.cast(tf.size(ious), tf.float32))

    def result(self):
        return tf.math.divide_no_nan(self.total_iou, self.count)

    def reset_states(self):
        self.total_iou.assign(0.0)
        self.count.assign(0.0)


In [6]:
def inception_block(x, filters):
    branch1x1 = layers.Conv2D(filters, (1, 1), padding='same', activation='relu')(x)
    branch3x3 = layers.Conv2D(filters, (1, 1), padding='same', activation='relu')(x)
    branch3x3 = layers.Conv2D(filters, (3, 3), padding='same', activation='relu')(branch3x3)
    branch5x5 = layers.Conv2D(filters, (1, 1), padding='same', activation='relu')(x)
    branch5x5 = layers.Conv2D(filters, (5, 5), padding='same', activation='relu')(branch5x5)
    branch_pool = layers.MaxPooling2D((3, 3), strides=(1, 1), padding='same')(x)
    branch_pool = layers.Conv2D(filters, (1, 1), padding='same', activation='relu')(branch_pool)
    x = layers.concatenate([branch1x1, branch3x3, branch5x5, branch_pool], axis=-1)
    x = layers.BatchNormalization()(x)
    return x

def mbconv_block(x, filters, kernel_size, strides=(1, 1), expand_ratio=6):
    input_tensor = x
    in_channels = x.shape[-1]
    x = layers.Conv2D(in_channels * expand_ratio, (1, 1), padding='same', use_bias=False)(x)
    x = layers.BatchNormalization()(x)
    x = layers.ReLU()(x)
    x = layers.DepthwiseConv2D(kernel_size, strides=strides, padding='same', use_bias=False)(x)
    x = layers.BatchNormalization()(x)
    x = layers.ReLU()(x)
    x = layers.Conv2D(filters, (1, 1), padding='same', use_bias=False)(x)
    x = layers.BatchNormalization()(x)

    if strides == (1, 1) and in_channels == filters:
        x = layers.add([x, input_tensor])
    return x

def efficientnet_encoder(input_shape=(256, 256, 3)):
    inputs = layers.Input(shape=input_shape)
    x = layers.Conv2D(32, (3, 3), strides=(2, 2), padding="same", use_bias=False)(inputs)
    x = layers.BatchNormalization()(x)
    x = layers.ReLU()(x)
    x = mbconv_block(x, 64, (3, 3), strides=(1, 1), expand_ratio=1)
    x = mbconv_block(x, 128, (3, 3), strides=(2, 2), expand_ratio=6)
    x = mbconv_block(x, 128, (3, 3), strides=(1, 1), expand_ratio=6)
    return models.Model(inputs, x, name="efficientnet_encoder")

def inception_decoder(input_tensor, num_classes=80):
    x = inception_block(input_tensor, 32)
    x = inception_block(x, 64)
    x = inception_block(x, 128)
    cls = layers.GlobalAveragePooling2D()(x)
    cls = layers.Dense(128, activation='relu')(cls)
    cls = layers.BatchNormalization()(cls)
    cls = layers.Dropout(0.5)(cls)
    class_output = layers.Dense(num_classes, activation='softmax', name="class_output")(cls)
    bbox = layers.Conv2D(64, (3, 3), padding='same', activation='relu')(x)
    bbox = layers.GlobalAveragePooling2D()(bbox)
    bbox_output = layers.Dense(4, activation='sigmoid', name="bbox_output")(bbox)

    return models.Model(input_tensor, [class_output, bbox_output], name="inception_decoder")

def custom_detection_model(input_shape=(256, 256, 3), num_classes=80):
    encoder = efficientnet_encoder(input_shape)
    image_input = encoder.input
    x = encoder.output
    x = layers.GlobalAveragePooling2D()(x)
    x = layers.Dense(128, activation='relu')(x)
    x = layers.Reshape((1, 1, 128))(x)
    x = layers.Conv2D(128, (1, 1), activation='relu')(x)
    x = layers.BatchNormalization()(x)
    decoder_model = inception_decoder(input_tensor=x, num_classes=num_classes)

    model = models.Model(inputs=image_input, outputs={
        "class_output": decoder_model.get_layer("class_output").output,
        "bbox_output": decoder_model.get_layer("bbox_output").output
    })

    model.compile(
        optimizer=tf.keras.optimizers.Adam(1e-4),
        loss={
            "class_output": "categorical_crossentropy",
            "bbox_output": Huber(delta=1.0)
        },
        metrics={
            "class_output": "accuracy",
            "bbox_output": Huber(delta=1.0)
        }
    )
    return model

model = custom_detection_model()

Checkpoints

In [7]:
checkpoint = tf.keras.callbacks.ModelCheckpoint(
    filepath = paths["Trained_model"],
    monitor = 'bbox_output_huber_loss',
    save_best_only = True,
    save_weights_only = False,
    mode = 'min',
    verbose = 1
)

Model training 

In [8]:
model.fit(
    train_dataset,
    epochs = 5,
    validation_data = val_dataset,
    callbacks = [checkpoint]
)

Epoch 1/5


I0000 00:00:1745102166.534764  488284 service.cc:146] XLA service 0x7caebc046af0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1745102166.534801  488284 service.cc:154]   StreamExecutor device (0): NVIDIA GeForce RTX 3060 Laptop GPU, Compute Capability 8.6
2025-04-20 04:06:06.813926: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2025-04-20 04:06:07.969770: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:531] Loaded cuDNN version 90101


[1m    4/29317[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m20:47[0m 43ms/step - bbox_output_huber_loss: 103.4196 - bbox_output_loss: 103.4196 - class_output_accuracy: 0.0000e+00 - class_output_loss: 5.6055 - loss: 109.0250

I0000 00:00:1745102183.513272  488284 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m29317/29317[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step - bbox_output_huber_loss: 100.6876 - bbox_output_loss: 100.6876 - class_output_accuracy: 0.1816 - class_output_loss: 3.7336 - loss: 104.4213
Epoch 1: bbox_output_huber_loss improved from inf to 100.67818, saving model to /home/utkarsh/Desktop/Sem-2/Deep Learning/2024PGCSDS14_Utkarsh Saxena_DeepLearning/DLProject/best_model_bbox.keras
[1m29317/29317[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1041s[0m 35ms/step - bbox_output_huber_loss: 100.6876 - bbox_output_loss: 100.6876 - class_output_accuracy: 0.1816 - class_output_loss: 3.7336 - loss: 104.4213 - val_bbox_output_huber_loss: 100.4362 - val_bbox_output_loss: 100.4362 - val_class_output_accuracy: 0.2490 - val_class_output_loss: 3.4081 - val_loss: 103.8442
Epoch 2/5
[1m29316/29317[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 34ms/step - bbox_output_huber_loss: 100.7825 - bbox_output_loss: 100.7825 - class_output_accuracy: 0.3188 - class_outpu

<keras.src.callbacks.history.History at 0x7cafccef10d0>