Libraries Used

In [42]:
import json as js
import tensorflow as tf
from os import listdir as ld
from os import path
from keras.models import Model #type: ignore
from keras import layers,models,regularizers #type: ignore
from keras.utils import to_categorical #type: ignore
from collections import defaultdict
import numpy as np
from PIL import Image

Loading Paths

In [43]:
with open("config_CNN.json",'r') as file:
    paths = js.load(file)

In [44]:
IMG_SIZE = 128
GRID_SIZE = 8
NUM_CLASSES = 80  

In [45]:
def load_annotations(json_path):
    with open(json_path, 'r') as f:
        annotations = js.load(f)

    label_map = defaultdict(list)  # {img_id: [bbox_dicts]}
    for ann in annotations:
        label_map[ann["img_id"]].append({
            "bbox": ann["bbox"],
            "category_id": ann["category_id"]
        })
    return label_map


In [46]:
with open(paths['Instance_Train'], 'r') as f:
    instance_train = js.load(f)
    
unique_cats = sorted({ann["id"] for ann in instance_train['categories']})
category_id_to_index = {cat_id: i for i, cat_id in enumerate(unique_cats)}
num_classes = len(category_id_to_index)


In [47]:
def preprocess_image_and_labels(img_path, ann_list, num_classes=NUM_CLASSES, grid_size=GRID_SIZE):
    image = Image.open(img_path).convert("RGB").resize((IMG_SIZE, IMG_SIZE))
    image = np.array(image) / 255.0
    label_tensor = np.zeros((grid_size, grid_size, 5 + num_classes), dtype=np.float32)

    for ann in ann_list:
        x, y, w, h = ann["bbox"]
        x_center = (x + w / 2) / IMG_SIZE
        y_center = (y + h / 2) / IMG_SIZE
        w /= IMG_SIZE
        h /= IMG_SIZE
        grid_x = int(x_center * grid_size)
        grid_y = int(y_center * grid_size)

        if grid_x >= grid_size: grid_x = grid_size - 1
        if grid_y >= grid_size: grid_y = grid_size - 1

        label_tensor[grid_y, grid_x, 0:4] = [x_center, y_center, w, h]
        label_tensor[grid_y, grid_x, 4] = 1.0  # Object confidence
        class_index = category_id_to_index[ann["category_id"]]
        label_tensor[grid_y, grid_x, 5 + class_index] = 1.0

    return image.astype(np.float32), label_tensor


In [48]:
def build_tf_dataset(image_dir, annotation_map, batch_size=32):
    image_files = list(annotation_map.keys())

    def generator():
        for img_file in image_files:
            img_path = path.join(image_dir, img_file)
            if not path.exists(img_path):
                continue
            img, label = preprocess_image_and_labels(img_path, annotation_map[img_file])
            yield img, label

    dataset = tf.data.Dataset.from_generator(
        generator,
        output_signature=(
            tf.TensorSpec(shape=(IMG_SIZE, IMG_SIZE, 3), dtype=tf.float32),
            tf.TensorSpec(shape=(GRID_SIZE, GRID_SIZE, 5 + NUM_CLASSES), dtype=tf.float32)
        )
    )

    dataset = dataset.shuffle(512).batch(batch_size).prefetch(tf.data.AUTOTUNE)
    dataset = dataset.repeat()  
    return dataset


In [49]:
def inception_block(x, filters):
    path1 = layers.Conv2D(filters, (1, 1), padding='same', activation='relu')(x)
    path2 = layers.Conv2D(filters, (1, 1), padding='same', activation='relu')(x)
    path2 = layers.Conv2D(filters, (3, 3), padding='same', activation='relu')(path2)
    path3 = layers.Conv2D(filters, (1, 1), padding='same', activation='relu')(x)
    path3 = layers.Conv2D(filters, (5, 5), padding='same', activation='relu')(path3)
    path4 = layers.MaxPooling2D((3, 3), strides=(1, 1), padding='same')(x)
    path4 = layers.Conv2D(filters, (1, 1), padding='same', activation='relu')(path4)
    return layers.Concatenate()([path1, path2, path3, path4])

def mbconv_block(x, filters, kernel_size=(3, 3), strides=(1, 1), expand_ratio=6):
    input_tensor = x
    in_channels = x.shape[-1]
    expanded = layers.Conv2D(in_channels * expand_ratio, (1, 1), padding='same', use_bias=False,kernel_regularizer=regularizers.l2(0.01))(x)
    expanded = layers.BatchNormalization()(expanded)
    expanded = layers.ReLU()(expanded)
    depthwise = layers.DepthwiseConv2D(kernel_size, strides=strides, padding='same', use_bias=False)(expanded)
    depthwise = layers.BatchNormalization()(depthwise)
    depthwise = layers.ReLU()(depthwise)
    projected = layers.Conv2D(filters, (1, 1), padding='same', use_bias=False,kernel_regularizer=regularizers.l2(0.01))(depthwise)
    projected = layers.BatchNormalization()(projected)
    if strides == (1, 1) and in_channels == filters:
        x = layers.Add()([input_tensor, projected])
    else:
        x = projected
    return x

def efficient_yolo_model(input_shape=(128, 128, 3), grid_size=8, num_classes=80):
    inputs = layers.Input(shape=input_shape)
    x = layers.Conv2D(32, (3, 3), strides=(2, 2), padding='same', activation='relu',kernel_regularizer=regularizers.l2(0.01))(inputs)
    x = mbconv_block(x, 64, strides=(1, 1), expand_ratio=2)
    x = mbconv_block(x, 128, strides=(2, 2), expand_ratio=2)
    x = mbconv_block(x, 256, strides=(2, 2), expand_ratio=4)
    x = inception_block(x, 128)
    x = inception_block(x, 128)
    x = layers.Conv2D(256, (3, 3), strides=(2, 2), padding='same',kernel_regularizer=regularizers.l2(0.01))(x)  
    x = layers.Conv2D(85, (1, 1), padding='same')(x)  
    model = models.Model(inputs, x)
    return model


In [50]:
def compute_iou_yolo(true_boxes, pred_boxes):
    true_xy = true_boxes[..., :2]
    true_wh = true_boxes[..., 2:4]
    true_x1y1 = true_xy - true_wh / 2
    true_x2y2 = true_xy + true_wh / 2
    pred_xy = pred_boxes[..., :2]
    pred_wh = pred_boxes[..., 2:4]
    pred_x1y1 = pred_xy - pred_wh / 2
    pred_x2y2 = pred_xy + pred_wh / 2
    intersect_x1 = tf.maximum(true_x1y1[..., 0], pred_x1y1[..., 0])
    intersect_y1 = tf.maximum(true_x1y1[..., 1], pred_x1y1[..., 1])
    intersect_x2 = tf.minimum(true_x2y2[..., 0], pred_x2y2[..., 0])
    intersect_y2 = tf.minimum(true_x2y2[..., 1], pred_x2y2[..., 1])
    intersect_area = tf.maximum(0.0, intersect_x2 - intersect_x1) * tf.maximum(0.0, intersect_y2 - intersect_y1)
    true_area = true_wh[..., 0] * true_wh[..., 1]
    pred_area = pred_wh[..., 0] * pred_wh[..., 1]
    union_area = true_area + pred_area - intersect_area
    iou = tf.math.divide_no_nan(intersect_area, union_area)
    return iou


In [51]:
class YOLOIoUMetric(tf.keras.metrics.Metric):
    def __init__(self, name='iou_metric', **kwargs):
        super(YOLOIoUMetric, self).__init__(name=name, **kwargs)
        self.total_iou = self.add_weight(name='total_iou', initializer='zeros')
        self.count = self.add_weight(name='count', initializer='zeros')

    def update_state(self, y_true, y_pred, sample_weight=None):
        true_conf = y_true[..., 4]
        pred_conf = y_pred[..., 4]
        true_boxes = y_true[..., 0:4]
        pred_boxes = y_pred[..., 0:4]
        mask = tf.where(true_conf > 0)
        true_boxes_masked = tf.gather_nd(true_boxes, mask)
        pred_boxes_masked = tf.gather_nd(pred_boxes, mask)
        ious = compute_iou_yolo(true_boxes_masked, pred_boxes_masked)
        self.total_iou.assign_add(tf.reduce_sum(ious))
        self.count.assign_add(tf.cast(tf.size(ious), tf.float32))

    def result(self):
        return tf.math.divide_no_nan(self.total_iou, self.count)

    def reset_states(self):
        self.total_iou.assign(0.0)
        self.count.assign(0.0)


In [52]:
def yolo_loss(y_true, y_pred, num_classes=80):
    true_box = y_true[..., :4]
    true_conf = y_true[..., 4:5]
    true_class = y_true[..., 5:]
    pred_box = tf.sigmoid(y_pred[..., :4])
    pred_conf = tf.sigmoid(y_pred[..., 4:5])
    pred_class = tf.nn.softmax(y_pred[..., 5:])
    iou = compute_iou_yolo(true_box, pred_box)
    conf_loss = tf.keras.losses.binary_crossentropy(true_conf, pred_conf)
    bbox_loss = tf.reduce_mean((true_box - pred_box) ** 2, axis=-1)
    class_loss = tf.keras.losses.categorical_crossentropy(true_class, pred_class)
    total_loss = bbox_loss + conf_loss + class_loss
    return tf.reduce_mean(total_loss)

In [53]:
checkpoint = tf.keras.callbacks.ModelCheckpoint(
    filepath=paths["Trained_model"],
    monitor='iou_metric',
    mode='max',
    save_best_only=True,
    save_weights_only=False,
    verbose=1
)

In [54]:
model = efficient_yolo_model(input_shape=(128, 128, 3), grid_size=8, num_classes=80)
model.compile(
    optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5),
    loss=yolo_loss,
    metrics=[YOLOIoUMetric()]
)

In [None]:
annotation_map = load_annotations(paths["Preprocessed_Train_json"])
train_dataset = build_tf_dataset(paths["Train_resized"], annotation_map, batch_size=16)

model.fit(
    train_dataset,
    epochs=20,
    callbacks=[checkpoint]
)


Epoch 1/20


I0000 00:00:1745253817.174081   84409 service.cc:146] XLA service 0x725fa00032b0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1745253817.174116   84409 service.cc:154]   StreamExecutor device (0): NVIDIA GeForce RTX 3060 Laptop GPU, Compute Capability 8.6
2025-04-21 22:13:37.346242: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2025-04-21 22:13:37.959971: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:531] Loaded cuDNN version 90101


      6/Unknown [1m15s[0m 25ms/step - iou_metric: 0.0000e+00 - loss: 12.6901

I0000 00:00:1745253827.385634   84409 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


  14575/Unknown [1m358s[0m 24ms/step - iou_metric: 0.0000e+00 - loss: 890931.0000