In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import os
import xml.etree.ElementTree as ET
import tensorflow as tf
import cv2
from sklearn.model_selection import train_test_split
# Summarise /kaggle/input per directory instead of printing every single file:
# this notebook reads an image dataset from there, and one line per file would
# bury the rest of the cell output.
for dirname, _, filenames in os.walk('/kaggle/input'):
    if filenames:
        example = sorted(filenames)[0]
        print(f"{dirname}: {len(filenames)} files (e.g. {example})")

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Object categories handled by the detector; the position of a name in this
# list is its integer class id.
CLASSES = ["person", "car", "dog", "bicycle", "cat"]
NUM_CLASSES = len(CLASSES)
# Reverse lookup: category name -> integer class id.
CLASS_TO_ID = dict(zip(CLASSES, range(NUM_CLASSES)))
def parse_voc_xml(xml_path):
    """Read bounding boxes and class ids from a Pascal VOC style annotation file.

    Only ``<object>`` elements that are direct children of the XML root and
    whose ``<name>`` is a key of ``CLASS_TO_ID`` are kept; all other objects
    are ignored.

    Args:
        xml_path: Path to the annotation XML file.

    Returns:
        A tuple ``(boxes, labels)``:
          * boxes  -- integer array of shape (N, 4) with rows
                      ``[xmin, ymin, xmax, ymax]`` as read from the file.
                      The shape is (0, 4) when no object is kept, so column
                      slicing works without a special case.
          * labels -- integer array of shape (N,) with the matching class ids.
    """
    root = ET.parse(xml_path).getroot()

    boxes = []
    labels = []

    for obj in root.findall("object"):
        # A missing or empty <name> is treated like an unknown class; stray
        # whitespace around the name must not prevent a match.
        cls = (obj.findtext("name") or "").strip()
        if cls not in CLASS_TO_ID:
            continue

        bbox = obj.find("bndbox")
        # int(float(...)) accepts coordinates written as "12" or "12.0".
        boxes.append(
            [int(float(bbox.find(tag).text)) for tag in ("xmin", "ymin", "xmax", "ymax")]
        )
        labels.append(CLASS_TO_ID[cls])

    return (
        np.array(boxes, dtype=np.int64).reshape(-1, 4),
        np.array(labels, dtype=np.int64),
    )

In [None]:
def load_image_and_labels(img_path, xml_path):
    """Load an image in RGB channel order together with its annotations.

    Args:
        img_path: Path of the image file, read with ``cv2.imread``.
        xml_path: Path of the annotation file, parsed by ``parse_voc_xml``.

    Returns:
        ``(image, boxes, labels)`` where ``image`` is the RGB pixel array and
        ``boxes`` / ``labels`` are exactly what ``parse_voc_xml`` returns.

    Raises:
        FileNotFoundError: if OpenCV could not read ``img_path``.
    """
    image = cv2.imread(img_path)
    if image is None:
        # cv2.imread reports a missing or unreadable file by returning None;
        # fail here with the offending path instead of inside cvtColor.
        raise FileNotFoundError(f"Could not read image: {img_path}")
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    boxes, labels = parse_voc_xml(xml_path)
    return image, boxes, labels
def normalize_boxes(boxes, img_w, img_h):
    """Divide box x-coordinates by the image width and y-coordinates by its height.

    Args:
        boxes: NumPy array of ``[xmin, ymin, xmax, ymax]`` rows; a single 1-D
            box or an empty array is accepted as well.
        img_w: Image width used to scale columns 0 and 2.
        img_h: Image height used to scale columns 1 and 3.

    Returns:
        A new float32 array with one row per box; shape (0, 4) for empty input.
    """
    scaled = boxes.astype(np.float32)  # astype copies, the caller's array is untouched
    if scaled.size == 0:
        return np.zeros((0, 4), dtype=np.float32)
    if scaled.ndim == 1:
        # Promote a single box to a one-row matrix.
        scaled = scaled[np.newaxis, ...]

    x_cols, y_cols = [0, 2], [1, 3]
    scaled[:, x_cols] = scaled[:, x_cols] / img_w
    scaled[:, y_cols] = scaled[:, y_cols] / img_h
    return scaled

In [None]:
# Number of cells along each side of the square prediction grid.
GRID_SIZE = 28
def encode_targets(boxes, labels, grid_size=None, num_classes=None):
    """Encode normalized boxes into a dense per-cell target tensor.

    Each box is assigned to the grid cell containing its centre. The channels
    of that cell are ``[x_center, y_center, w, h, objectness, one-hot class]``,
    with the box values stored in the same normalized units as the input.

    Args:
        boxes: Iterable of ``[xmin, ymin, xmax, ymax]`` rows in normalized
            coordinates.
        labels: Iterable of integer class ids, aligned with ``boxes``.
        grid_size: Cells per grid side; defaults to ``GRID_SIZE``.
        num_classes: Number of class channels; defaults to ``NUM_CLASSES``.

    Returns:
        Array of shape ``(grid_size, grid_size, 5 + num_classes)``; cells
        without an object are all zeros.
    """
    grid_size = GRID_SIZE if grid_size is None else grid_size
    num_classes = NUM_CLASSES if num_classes is None else num_classes
    target = np.zeros((grid_size, grid_size, 5 + num_classes))

    for box, label in zip(boxes, labels):
        x_center = (box[0] + box[2]) / 2
        y_center = (box[1] + box[3]) / 2
        w = box[2] - box[0]
        h = box[3] - box[1]

        # floor (not int truncation) so that slightly negative centres do not
        # collapse into cell 0.
        grid_x = int(np.floor(x_center * grid_size))
        grid_y = int(np.floor(y_center * grid_size))

        # Skip centres outside the grid on either side; a negative index would
        # otherwise silently wrap around to the opposite edge of the array.
        if not (0 <= grid_x < grid_size and 0 <= grid_y < grid_size):
            continue

        cell = target[grid_y, grid_x]  # view into `target`
        # When two objects share a cell the later one wins; clear the cell so
        # the class vector stays one-hot instead of accumulating both classes.
        cell[:] = 0.0
        cell[0:4] = [x_center, y_center, w, h]
        cell[4] = 1.0
        cell[5 + int(label)] = 1.0

    return target

In [None]:
def preprocess(img_path, xml_path, augment=True):
    """Load one sample and turn it into a network input and target tensor.

    Args:
        img_path: Path of the image file.
        xml_path: Path of the matching annotation file.
        augment: When True (default), samples that contain at least one box
            are randomly flipped horizontally and randomly brightness/contrast
            jittered, each with probability 0.5. Pass False for deterministic
            output, e.g. when building evaluation data.

    Returns:
        ``(image, target)``: ``image`` is a float32 array resized to
        224x224 and divided by 255; ``target`` is the float32 result of
        ``encode_targets`` for the normalized boxes.
    """
    image, boxes, labels = load_image_and_labels(img_path, xml_path)
    h, w, _ = image.shape

    if augment and boxes.size > 0:
        if np.random.rand() < 0.5:
            image = cv2.flip(image, 1)
            # Mirror the x-coordinates; xmin and xmax swap roles after a flip.
            boxes[:, [0, 2]] = w - boxes[:, [2, 0]]

        if np.random.rand() < 0.5:
            image = image.astype(np.float32)
            brightness = np.random.uniform(-30, 30)
            image += brightness
            contrast = np.random.uniform(0.8, 1.2)
            image *= contrast
            image = np.clip(image, 0, 255)

    boxes = normalize_boxes(boxes, w, h)
    # encode_targets already yields an all-zero grid for an empty box list, so
    # no separate branch with a hard-coded grid size is needed.
    target = encode_targets(boxes, labels)

    image = cv2.resize(image, (224, 224))
    image = image / 255.0
    return image.astype(np.float32), target.astype(np.float32)

In [None]:
IMAGE_DIR = "/kaggle/input/pascal-voc-2012-dataset/VOC2012_train_val/VOC2012_train_val/JPEGImages"
ANNOT_DIR = "/kaggle/input/pascal-voc-2012-dataset/VOC2012_train_val/VOC2012_train_val/Annotations"

image_files = sorted(os.listdir(IMAGE_DIR))
xml_files = sorted(os.listdir(ANNOT_DIR))

# Pair every image with the annotation that shares its file stem. Images whose
# XML file does not exist are left out, so a missing annotation cannot crash
# the data generator in the middle of training.
_available_xml = set(xml_files)
data_pairs = []
for image_name in image_files:
    # splitext only touches the real extension, unlike str.replace(".jpg", ...).
    xml_name = os.path.splitext(image_name)[0] + ".xml"
    if xml_name not in _available_xml:
        continue
    data_pairs.append(
        (os.path.join(IMAGE_DIR, image_name), os.path.join(ANNOT_DIR, xml_name))
    )

In [None]:
# Hold out 20% of the (image, annotation) pairs for validation; the fixed seed
# keeps the split identical between runs.
train_pairs, val_pairs = train_test_split(
    data_pairs, test_size=0.2, random_state=42, shuffle=True
)

# Per-sample element spec handed to tf.data.Dataset.from_generator:
# (image, target grid).
output_signature = (
    tf.TensorSpec(shape=(224, 224, 3), dtype=tf.float32),
    tf.TensorSpec(shape=(28, 28, 5 + NUM_CLASSES), dtype=tf.float32),
)
def tf_data_generator(data_pairs):
    """Yield ``(image, target)`` float32 tensors for each (image, xml) path pair.

    Every pair is passed through ``preprocess``; the results are cast to
    float32 and their shapes are checked before being yielded.
    """
    image_shape = (224, 224, 3)
    target_shape = (28, 28, 5 + NUM_CLASSES)

    for img_path, xml_path in data_pairs:
        raw_image, raw_target = preprocess(img_path, xml_path)
        yield (
            tf.ensure_shape(tf.cast(raw_image, tf.float32), image_shape),
            tf.ensure_shape(tf.cast(raw_target, tf.float32), target_shape),
        )

# Training pipeline: shuffle within a 200-sample buffer, batch by 8, repeat
# indefinitely and prefetch.
train_dataset = (
    tf.data.Dataset.from_generator(
        lambda: tf_data_generator(train_pairs),
        output_signature=output_signature,
    )
    .shuffle(200)
    .batch(8)
    .repeat()
    .prefetch(tf.data.AUTOTUNE)
)

# Validation pipeline: same generator, batched by 8, no shuffling or repeat.
val_dataset = (
    tf.data.Dataset.from_generator(
        lambda: tf_data_generator(val_pairs),
        output_signature=output_signature,
    )
    .batch(8)
    .prefetch(tf.data.AUTOTUNE)
)

# Sanity check: print the batch shapes of the first training batch.
for images, targets in train_dataset.take(1):
    print(images.shape)
    print(targets.shape)

In [None]:
from tensorflow.keras.layers import Input,Conv2D,MaxPooling2D,UpSampling2D,Concatenate,Conv2DTranspose
from tensorflow.keras.layers import BatchNormalization,Activation
from tensorflow.keras.models import Model
from tensorflow.keras.utils import plot_model
def custom_cnn(input_shape=(224,224,3)):
    """Build the convolutional backbone.

    Four blocks of Conv2D(3x3, ReLU) -> BatchNormalization -> MaxPooling2D(2x2)
    with 16, 32, 64 and 128 filters.

    Returns:
        A Keras ``Model`` with two outputs: the output of the fourth block and
        the output of the third block (used as skip features).
    """
    inputs = Input(shape=input_shape)

    x = inputs
    skip_features = None
    for block_number, filters in enumerate((16, 32, 64, 128), start=1):
        x = Conv2D(filters, (3, 3), padding='same', activation='relu')(x)
        x = BatchNormalization()(x)
        x = MaxPooling2D((2, 2))(x)
        if block_number == 3:
            # Keep the block-3 output as the skip connection for the head.
            skip_features = x

    return Model(inputs=inputs, outputs=[x, skip_features], name="Custom_CNN_Backbone")

In [None]:
def detection_head(feature_maps,skip_features, num_classes):
    """Attach the detection head to the backbone outputs.

    The deep feature map is upsampled by 2, concatenated with a 1x1-projected
    copy of the skip features, refined by a 3x3 convolution and mapped to
    ``5 + num_classes`` channels by a final 1x1 convolution without activation
    (i.e. the head returns raw, un-activated values).
    """
    def conv_bn_relu(tensor, filters, kernel_size):
        # Bias-free convolution followed by batch norm and ReLU.
        tensor = Conv2D(filters, kernel_size, padding='same', use_bias=False)(tensor)
        tensor = BatchNormalization()(tensor)
        return Activation('relu')(tensor)

    upsampled = UpSampling2D(2)(feature_maps)
    projected_skip = conv_bn_relu(skip_features, 64, 1)

    fused = Concatenate()([upsampled, projected_skip])
    fused = conv_bn_relu(fused, 256, 3)

    return Conv2D(5 + num_classes, 1, padding='same', activation=None)(fused)

In [None]:
def object_model(input_shape,num_classes):
    """Assemble the full detector: ``custom_cnn`` backbone plus ``detection_head``.

    Prints the symbolic shapes of the skip features, the deep features and the
    head output while building, then returns the end-to-end Keras ``Model``.
    """
    backbone = custom_cnn(input_shape)
    deep_features, skip_features = backbone.output

    # Shape printouts, in the order: skip features, deep features, head output.
    print(skip_features.shape)
    print(deep_features.shape)
    head_output = detection_head(deep_features, skip_features, num_classes)
    print(head_output.shape)

    detector = Model(
        inputs=backbone.input,
        outputs=head_output,
        name="Custom_CNN_Detector",
    )
    return detector

In [None]:
def detection_loss(y_true, y_pred):
    """Loss for the grid detector: box regression + objectness + classification.

    Channel layout of both tensors (last axis):
    ``[0:4]`` box values, ``[4]`` objectness, ``[5:]`` per-class values.
    ``y_true`` holds a 0/1 objectness flag and a one-hot class vector;
    ``y_pred`` holds raw network outputs (logits for objectness and classes).

    Terms:
      * box loss   -- squared error on the four raw box outputs, summed over
                      cells that contain an object and divided by their count.
      * objectness -- sigmoid cross-entropy averaged over all cells.
      * class loss -- softmax cross-entropy against the one-hot class vector,
                      again restricted to cells that contain an object.

    Returns:
        A scalar tensor, the sum of the three terms.
    """
    y_true = tf.cast(y_true, y_pred.dtype)

    obj_mask = y_true[..., 4]
    # Avoid dividing by zero for batches without any object.
    num_obj = tf.maximum(tf.reduce_sum(obj_mask), 1.0)

    # Empty cells carry no box, so they must not pull the regression to zero.
    box_sq_err = tf.reduce_sum(tf.square(y_true[..., :4] - y_pred[..., :4]), axis=-1)
    bbox_loss = tf.reduce_sum(obj_mask * box_sq_err) / num_obj

    obj_loss = tf.reduce_mean(
        tf.nn.sigmoid_cross_entropy_with_logits(labels=obj_mask, logits=y_pred[..., 4])
    )

    # The class target is the whole one-hot slice [5:]; channel 5 alone is only
    # the indicator for class 0 and cannot be used as a sparse class index.
    cls_ce = tf.nn.softmax_cross_entropy_with_logits(
        labels=y_true[..., 5:], logits=y_pred[..., 5:]
    )
    cls_loss = tf.reduce_sum(obj_mask * cls_ce) / num_obj

    return bbox_loss + obj_loss + cls_loss

In [None]:
# Build the detector with as many class channels as the target tensors use
# (NUM_CLASSES), so the head and the encoded labels cannot drift apart when
# the class list is edited.
model = object_model(input_shape=(224,224,3),num_classes=NUM_CLASSES)
model.compile(
    optimizer=tf.keras.optimizers.Adam(1e-3),
    loss=detection_loss
)
model.summary()

In [None]:
# The step counts are the number of whole batches of 8 samples in each split
# (8 is the batch size the datasets were built with).
fit_options = dict(
    validation_data=val_dataset,
    epochs=50,
    steps_per_epoch=len(train_pairs) // 8,
    validation_steps=len(val_pairs) // 8,
)
model.fit(train_dataset, **fit_options)

In [None]:
def decode_predictions(pred, conf_thresh=0.05):
    """Convert one raw prediction grid into a list of scored boxes.

    The box channels are read the way the training targets are encoded:
    channels 0-3 are the box centre and size ``[x_center, y_center, w, h]`` in
    normalized image coordinates, used as raw values (the loss regresses the
    un-activated outputs directly to those targets). Channel 4 is the
    objectness logit and channels 5+ are class logits.

    Args:
        pred: Array-like of shape (rows, cols, 5 + num_classes) for one image.
        conf_thresh: Cells whose objectness probability is below this value
            are discarded.

    Returns:
        Float array of shape (K, 6) with rows
        ``[xmin, ymin, xmax, ymax, score, class_id]`` in row-major cell order,
        corners clipped to [0, 1], and ``score`` = objectness probability times
        the softmax probability of the winning class. Shape (0, 6) when no
        cell passes the threshold.
    """
    pred = np.asarray(pred, dtype=np.float64)

    # Numerically stable sigmoid of the objectness logit.
    obj_score = np.exp(-np.logaddexp(0.0, -pred[..., 4]))
    keep = obj_score >= conf_thresh
    if not keep.any():
        return np.zeros((0, 6))

    cells = pred[keep]  # (K, channels), row-major order of the kept cells
    cx, cy = cells[:, 0], cells[:, 1]
    # A raw regression output can be slightly negative; a size cannot.
    w = np.maximum(cells[:, 2], 0.0)
    h = np.maximum(cells[:, 3], 0.0)

    corners = np.stack(
        [cx - w / 2, cy - h / 2, cx + w / 2, cy + h / 2], axis=1
    )
    corners = np.clip(corners, 0.0, 1.0)

    # Stable softmax over the class logits.
    logits = cells[:, 5:]
    exp_logits = np.exp(logits - logits.max(axis=1, keepdims=True))
    class_probs = exp_logits / exp_logits.sum(axis=1, keepdims=True)
    class_id = class_probs.argmax(axis=1)
    class_score = class_probs[np.arange(len(cells)), class_id]

    score = obj_score[keep] * class_score
    return np.column_stack([corners, score, class_id.astype(np.float64)])

In [None]:
def apply_nms(boxes, iou_thresh=0.5):
    """Filter detections with TensorFlow non-max suppression.

    Args:
        boxes: Array whose first four columns are box corners and whose fifth
            column is the score; any further columns are carried along
            untouched and do not influence the suppression.
        iou_thresh: IoU threshold handed to ``tf.image.non_max_suppression``.

    Returns:
        The selected rows of ``boxes`` (at most 100); the input itself when it
        is empty.
    """
    if len(boxes) == 0:
        return boxes

    corner_tensor = tf.convert_to_tensor(boxes[:, :4], dtype=tf.float32)
    score_tensor = tf.convert_to_tensor(boxes[:, 4], dtype=tf.float32)

    keep_indices = tf.image.non_max_suppression(
        corner_tensor,
        score_tensor,
        max_output_size=100,
        iou_threshold=iou_thresh,
    ).numpy()

    return boxes[keep_indices]

In [None]:
def detect_objects_for_map(model, img, conf_thresh=0.05):
    """Run the model on a batch and return the NMS-filtered boxes of its first image.

    Args:
        model: Callable model; invoked as ``model(img, training=False)``.
        img: Input batch; only the first element of the output is decoded.
        conf_thresh: Threshold forwarded to ``decode_predictions``.
    """
    first_prediction = model(img, training=False)[0].numpy()
    candidates = decode_predictions(first_prediction, conf_thresh)
    return apply_nms(candidates)

In [None]:
# Load every validation sample into memory: network-ready images (224x224,
# scaled to [0, 1], float32) and their normalized ground-truth boxes.
images, annotations = [], []
for img_path, xml_path in val_pairs:
    image, boxes, labels = load_image_and_labels(img_path, xml_path)
    h, w = image.shape[:2]
    annotations.append(normalize_boxes(boxes, w, h))

    image = cv2.resize(image, (224, 224)) / 255.0
    images.append(image.astype(np.float32))

In [None]:
# Run the detector on each validation image (as a batch of one) and keep the
# detections aligned with the corresponding ground-truth boxes.
detections = []
ground_truths = []

for img, gt in zip(images, annotations):
    single_image_batch = img[np.newaxis, ...]
    detections.append(detect_objects_for_map(model, single_image_batch))
    ground_truths.append(gt)

In [None]:
def compute_iou(box1, box2):
    """Intersection-over-union of two boxes given as ``(x1, y1, x2, y2)``.

    Returns:
        The IoU as a float in [0, 1]. Degenerate input (zero-area or inverted
        boxes whose union is not positive) yields 0.0 instead of raising a
        division error.
    """
    x1, y1, x2, y2 = box1
    x1g, y1g, x2g, y2g = box2

    inter_w = max(0.0, min(x2, x2g) - max(x1, x1g))
    inter_h = max(0.0, min(y2, y2g) - max(y1, y1g))
    inter_area = inter_w * inter_h

    # Clamp side lengths so an inverted box counts as empty, not negative.
    box1_area = max(0.0, x2 - x1) * max(0.0, y2 - y1)
    box2_area = max(0.0, x2g - x1g) * max(0.0, y2g - y1g)
    union_area = box1_area + box2_area - inter_area

    if union_area <= 0:
        return 0.0
    return inter_area / union_area

In [None]:
from sklearn.metrics import average_precision_score
def _iou_one_to_many(box, boxes):
    """IoU of one ``[x1, y1, x2, y2]`` box against every row of ``boxes`` (N, 4)."""
    inter_w = np.clip(
        np.minimum(box[2], boxes[:, 2]) - np.maximum(box[0], boxes[:, 0]), 0.0, None
    )
    inter_h = np.clip(
        np.minimum(box[3], boxes[:, 3]) - np.maximum(box[1], boxes[:, 1]), 0.0, None
    )
    inter = inter_w * inter_h

    box_area = max(box[2] - box[0], 0.0) * max(box[3] - box[1], 0.0)
    boxes_area = (
        np.clip(boxes[:, 2] - boxes[:, 0], 0.0, None)
        * np.clip(boxes[:, 3] - boxes[:, 1], 0.0, None)
    )
    union = box_area + boxes_area - inter

    # Degenerate boxes give a non-positive union; report IoU 0 for them.
    positive = union > 0
    return np.where(positive, inter / np.where(positive, union, 1.0), 0.0)


def compute_map(detections, annotations, iou_threshold=0.5):
    """Average precision of the detections over the whole evaluation set.

    Detections from all images are ranked by score. Walking down that ranking,
    a detection is a true positive when its best still-unmatched ground-truth
    box in the same image has IoU >= ``iou_threshold``; otherwise it is a
    false positive (this includes detections on images without ground truth).
    The result is the area under the precision-recall curve after making
    precision monotonically non-increasing.

    Only box geometry is compared; the class id column of the detections is
    not used because ``annotations`` carries boxes only.

    Args:
        detections: One array per image with rows
            ``[xmin, ymin, xmax, ymax, score, ...]``; may be empty.
        annotations: One array per image with ground-truth rows
            ``[xmin, ymin, xmax, ymax]``; may be empty.
        iou_threshold: Minimum IoU for a match.

    Returns:
        The average precision as a float in [0, 1]; 0.0 when there is no
        ground truth or no detection at all.
    """
    total_gt = 0
    scored_hits = []  # (score, is_true_positive) for every detection

    for det, ann in zip(detections, annotations):
        ann = np.asarray(ann, dtype=np.float64).reshape(-1, 4)
        det = np.asarray(det, dtype=np.float64)
        det = det.reshape(-1, det.shape[-1]) if det.size else np.zeros((0, 6))

        total_gt += len(ann)
        matched = np.zeros(len(ann), dtype=bool)

        # Highest-scoring detections get the first pick of ground-truth boxes.
        for d in det[np.argsort(-det[:, 4], kind="stable")]:
            is_tp = False
            if len(ann) > 0:
                ious = _iou_one_to_many(d[:4], ann)
                ious[matched] = -1.0  # each ground-truth box matches at most once
                best = int(np.argmax(ious))
                if ious[best] >= iou_threshold:
                    matched[best] = True
                    is_tp = True
            scored_hits.append((d[4], is_tp))

    if total_gt == 0 or not scored_hits:
        return 0.0

    scored_hits.sort(key=lambda item: -item[0])
    tp = np.array([hit for _, hit in scored_hits], dtype=np.float64)
    tp_cum = np.cumsum(tp)
    fp_cum = np.cumsum(1.0 - tp)

    recall = tp_cum / total_gt
    precision = tp_cum / (tp_cum + fp_cum)

    # Pad the curve, take the precision envelope, then integrate over recall.
    mrec = np.concatenate(([0.0], recall, [1.0]))
    mpre = np.concatenate(([0.0], precision, [0.0]))
    mpre = np.maximum.accumulate(mpre[::-1])[::-1]
    steps = np.nonzero(mrec[1:] != mrec[:-1])[0]

    return float(np.sum((mrec[steps + 1] - mrec[steps]) * mpre[steps + 1]))

# Score the collected detections against the ground-truth boxes and report
# the result with four decimals.
mAP = compute_map(detections=detections, annotations=ground_truths)
print(f"Mean Average Precision (mAP): {mAP:.4f}")

In [None]:
import time

# Fetch the batches first so that reading and preprocessing images in the
# input pipeline is not counted as inference time.
timing_batches = [batch_images for batch_images, _ in val_dataset.take(10)]
num_images = sum(int(batch_images.shape[0]) for batch_images in timing_batches)

if timing_batches:
    # Warm-up call so one-off setup cost of the first prediction is excluded.
    model.predict_on_batch(timing_batches[0])

start = time.perf_counter()
for batch_images in timing_batches:
    model.predict_on_batch(batch_images)
elapsed = time.perf_counter() - start

# Frames per second = images processed / elapsed time (each batch holds
# several images, so dividing the batch count by the time would under-report).
fps = num_images / elapsed if elapsed > 0 else 0.0
print("FPS:", fps)

In [None]:
# Persist the trained detector to "custom_detector.h5" in the current working directory.
# NOTE(review): the model was compiled with the custom `detection_loss` function, so
# reloading presumably needs `custom_objects={"detection_loss": detection_loss}` or
# `compile=False` -- confirm against the installed Keras version.
model.save("custom_detector.h5")

In [None]:
import cv2
import numpy as np

def run_detection(model, image_path, conf_thresh=0.10):
    """Detect objects in one image file and draw the boxes onto it.

    Args:
        model: Trained detector, passed on to ``detect_objects_for_map``.
        image_path: Path of the image to process.
        conf_thresh: Confidence threshold forwarded to the detection step.

    Returns:
        The original-resolution image in OpenCV's BGR channel order with a
        green rectangle and a "Class <id> | <score>" caption per detection.

    Raises:
        FileNotFoundError: if OpenCV could not read ``image_path``.
    """
    img = cv2.imread(image_path)
    if img is None:
        raise FileNotFoundError(f"Could not read image: {image_path}")
    h, w, _ = img.shape

    # cv2.imread yields BGR, but the training pipeline converts images to RGB
    # before feeding the network; feed the same channel order here.
    rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    img_input = cv2.resize(rgb, (224, 224)).astype(np.float32) / 255.0
    img_input = np.expand_dims(img_input, axis=0)

    boxes = detect_objects_for_map(model, img_input, conf_thresh)

    for box in boxes:
        xmin, ymin, xmax, ymax, score, cls = box
        # Boxes are normalized; scale them back to the original resolution.
        x1 = int(xmin * w)
        y1 = int(ymin * h)
        x2 = int(xmax * w)
        y2 = int(ymax * h)
        # The class id arrives as a float inside the detection array.
        class_id = int(cls)

        cv2.rectangle(img, (x1, y1), (x2, y2), (0,255,0), 2)
        cv2.putText(
            img,
            f"Class {class_id} | {score:.2f}",
            # Keep the caption inside the image for boxes touching the top edge.
            (x1, max(y1 - 5, 12)),
            cv2.FONT_HERSHEY_SIMPLEX,
            0.5,
            (0,255,0),
            1
        )
    return img

In [None]:
import matplotlib.pyplot as plt

img_path = "/kaggle/input/pascal-voc-2012-dataset/VOC2012_test/VOC2012_test/JPEGImages/2008_000029.jpg"

output = run_detection(model, img_path, conf_thresh=0.15)

# The annotated image is converted from BGR to RGB because matplotlib
# interprets arrays as RGB.
fig, ax = plt.subplots(figsize=(8,8))
ax.imshow(cv2.cvtColor(output, cv2.COLOR_BGR2RGB))
ax.axis("off")
plt.show()