In [1]:
import zipfile
import os
import cv2
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout

# Unpack the dataset archive so its contents are available on disk
def extract_zip(zip_path, extract_to='dataset'):
    """Extract every member of a zip archive into a target directory.

    Args:
        zip_path: Path of the ``.zip`` archive to read.
        extract_to: Directory that receives the extracted members.

    Returns:
        None. A confirmation line is printed once extraction has finished.
    """
    with zipfile.ZipFile(zip_path, mode='r') as archive:
        archive.extractall(path=extract_to)
    print(f"Dataset extracted to {extract_to}")

# Location of the dataset archive -- change this to point at your own zip file
dataset_zip = 'dataset.zip'
extract_zip(dataset_zip, extract_to='dataset')

2025-08-13 11:57:49.223778: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-08-13 11:57:49.271869: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1755066469.315233 1189715 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1755066469.320970 1189715 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1755066469.340143 1189715 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

Dataset extracted to dataset


In [6]:
import os
import cv2
import numpy as np
import xml.etree.ElementTree as ET
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout

# Define paths
# Both directories live under the 'dataset' extraction folder. load_dataset
# looks for each image's .xml annotation in the same directory as the .jpg.
train_img_dir = 'dataset/object/train/train'
test_img_dir = 'dataset/object/test/test'

In [7]:
def parse_annotation(xml_file):
    """Read object annotations from a VOC-style XML annotation file.

    Each ``<object>`` element is expected to hold a ``<name>`` and a
    ``<bndbox>`` with ``xmin``/``ymin``/``xmax``/``ymax`` children.

    Args:
        xml_file: Path (or file object) of the XML annotation to parse.

    Returns:
        A list with one dict per usable object, in document order:
        ``{'class': <name text>, 'bbox': [xmin, ymin, xmax, ymax]}`` where the
        coordinates are ints. Objects lacking a name, a bounding box or any
        of the four coordinates are skipped. The list is empty when nothing
        usable is found.

    Raises:
        xml.etree.ElementTree.ParseError: If the file is not well-formed XML.
        ValueError: If a coordinate is present but not numeric.
    """
    root = ET.parse(xml_file).getroot()

    objects = []
    for obj in root.findall('object'):
        name_node = obj.find('name')
        bndbox = obj.find('bndbox')
        # An object without a label or a box cannot be used for training.
        if name_node is None or name_node.text is None or bndbox is None:
            continue

        coord_nodes = [bndbox.find(tag) for tag in ('xmin', 'ymin', 'xmax', 'ymax')]
        if any(node is None or node.text is None for node in coord_nodes):
            continue

        objects.append({
            'class': name_node.text,
            # Go through float() first so values written as "48.0" are
            # accepted; plain int() would raise ValueError on them.
            'bbox': [int(float(node.text)) for node in coord_nodes]
        })
    return objects

In [8]:
def load_dataset(image_dir, class_names=None):
    """Load resized images, class indices and bounding boxes from a folder.

    Every ``.jpg`` in ``image_dir`` is paired with the ``.xml`` annotation of
    the same base name in the same directory. Images without an annotation
    file, without any annotated object, or that cannot be decoded are skipped.
    Only the first annotated object of each image is used.

    Args:
        image_dir: Directory containing the ``.jpg`` images and their ``.xml``
            annotations.
        class_names: Optional sequence of class names that fixes the
            name -> index mapping (index = position in the sequence). Pass the
            class list returned for the training set when loading another
            split so label indices agree across splits. Images whose class is
            not in the sequence are skipped. When ``None``, the mapping is
            built from the sorted set of classes found in ``image_dir``.

    Returns:
        A tuple ``(images, labels, bboxes, class_names)``:
        ``images`` is an array of the images resized to 224x224,
        ``labels`` is an array of integer class indices,
        ``bboxes`` is an array of ``[xmin, ymin, xmax, ymax]`` in the
        coordinates of the resized image, and ``class_names`` is the list of
        class names indexed by label. The first three are row-aligned.
    """
    images = []
    label_names = []
    bboxes = []

    allowed = None if class_names is None else set(class_names)
    skipped_unknown = 0

    # os.listdir returns entries in arbitrary order; sorting gives a stable
    # sample order so a seeded train/validation split is reproducible.
    for img_file in sorted(os.listdir(image_dir)):
        if not img_file.lower().endswith('.jpg'):
            continue

        # Get corresponding XML file
        xml_path = os.path.join(image_dir, os.path.splitext(img_file)[0] + '.xml')
        if not os.path.exists(xml_path):
            continue

        # Parse annotation
        annotations = parse_annotation(xml_path)
        if not annotations:
            continue

        # For simplicity, we'll use the first object in the image
        obj = annotations[0]
        if allowed is not None and obj['class'] not in allowed:
            skipped_unknown += 1
            continue

        # Load image
        image = cv2.imread(os.path.join(image_dir, img_file))
        if image is None:
            continue

        # Resize image and scale the bounding box by the same factors
        orig_h, orig_w = image.shape[:2]
        image = cv2.resize(image, (224, 224))
        xmin, ymin, xmax, ymax = obj['bbox']
        xmin = int(xmin * (224 / orig_w))
        xmax = int(xmax * (224 / orig_w))
        ymin = int(ymin * (224 / orig_h))
        ymax = int(ymax * (224 / orig_h))

        # Record image, class and box together so the three outputs can
        # never drift out of alignment when a file is skipped.
        images.append(image)
        label_names.append(obj['class'])
        bboxes.append([xmin, ymin, xmax, ymax])

    if skipped_unknown:
        print(f"Skipped {skipped_unknown} image(s) in {image_dir} "
              f"whose class is not in the supplied class list")

    # Convert class names to indices
    if class_names is None:
        class_names = sorted(set(label_names))
    else:
        class_names = list(class_names)
    class_to_idx = {name: idx for idx, name in enumerate(class_names)}
    labels = [class_to_idx[name] for name in label_names]

    return np.array(images), np.array(labels), np.array(bboxes), class_names

# Load datasets
X_train, y_train, bboxes_train, classes = load_dataset(train_img_dir)
# Encode the test labels with the training class list so an index means the
# same class in both splits, even if the test folder lacks some classes.
X_test, y_test, bboxes_test, _ = load_dataset(test_img_dir, class_names=classes)

In [9]:
# Scale pixel intensities from [0, 255] down to [0, 1]
X_train, X_test = (pixels.astype('float32') / 255.0 for pixels in (X_train, X_test))

# Express box corners as fractions of the 224-pixel resized image side
bboxes_train, bboxes_test = (boxes / 224.0 for boxes in (bboxes_train, bboxes_test))

# One-hot encode the integer class labels
num_classes = len(classes)
y_train, y_test = (to_categorical(ids, num_classes) for ids in (y_train, y_test))

# Hold out 20% of the training samples for validation (fixed seed)
(X_train, X_val,
 y_train, y_val,
 bboxes_train, bboxes_val) = train_test_split(
    X_train, y_train, bboxes_train, test_size=0.2, random_state=42)

In [10]:
from tensorflow.keras.layers import Input
from tensorflow.keras.models import Model

def create_object_detection_model(input_shape, num_classes):
    """Build and compile a small CNN with a class head and a box head.

    The shared backbone is three 3x3 ReLU convolutions (32, 64 and 128
    filters), each followed by 2x2 max pooling, then a flatten. Both heads
    take the flattened features through Dense(128, relu) and Dropout(0.5).

    Args:
        input_shape: Shape of one input image, passed to ``Input``.
        num_classes: Number of units in the softmax classification output.

    Returns:
        A compiled ``Model`` with two outputs: ``class_output`` (softmax over
        ``num_classes``) and ``bbox_output`` (4 sigmoid units). It is compiled
        with Adam, categorical cross-entropy for the class head and MSE for
        the box head.
    """
    inputs = Input(shape=input_shape)

    # Shared convolutional backbone
    features = inputs
    for n_filters in (32, 64, 128):
        features = Conv2D(n_filters, (3, 3), activation='relu')(features)
        features = MaxPooling2D((2, 2))(features)
    features = Flatten()(features)

    # Classification head
    cls_hidden = Dense(128, activation='relu')(features)
    cls_hidden = Dropout(0.5)(cls_hidden)
    cls_output = Dense(num_classes, activation='softmax', name='class_output')(cls_hidden)

    # Bounding box regression head; sigmoid keeps the outputs inside [0, 1]
    box_hidden = Dense(128, activation='relu')(features)
    box_hidden = Dropout(0.5)(box_hidden)
    bbox_output = Dense(4, activation='sigmoid', name='bbox_output')(box_hidden)

    model = Model(inputs=inputs, outputs=[cls_output, bbox_output])

    # Losses and metrics are keyed by output layer name
    losses = {'class_output': 'categorical_crossentropy',
              'bbox_output': 'mse'}
    tracked_metrics = {'class_output': 'accuracy',
                       'bbox_output': 'mse'}
    model.compile(optimizer='adam', loss=losses, metrics=tracked_metrics)

    return model

# Instantiate the network for 224x224 three-channel inputs and print its layer table
model = create_object_detection_model(input_shape=(224, 224, 3), num_classes=num_classes)
model.summary()

I0000 00:00:1755067058.292396 1189715 gpu_device.cc:2019] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 123436 MB memory:  -> device: 0, name: NVIDIA H200, pci bus id: 0000:1b:00.0, compute capability: 9.0
I0000 00:00:1755067058.309029 1189715 gpu_device.cc:2019] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 139166 MB memory:  -> device: 1, name: NVIDIA H200, pci bus id: 0000:43:00.0, compute capability: 9.0
I0000 00:00:1755067058.310658 1189715 gpu_device.cc:2019] Created device /job:localhost/replica:0/task:0/device:GPU:2 with 133249 MB memory:  -> device: 2, name: NVIDIA H200, pci bus id: 0000:52:00.0, compute capability: 9.0
I0000 00:00:1755067058.312187 1189715 gpu_device.cc:2019] Created device /job:localhost/replica:0/task:0/device:GPU:3 with 137473 MB memory:  -> device: 3, name: NVIDIA H200, pci bus id: 0000:61:00.0, compute capability: 9.0
I0000 00:00:1755067058.314646 1189715 gpu_device.cc:2019] Created device /job:localhost/replica:0/ta

In [12]:
# Silence every Python warning from this point on
import warnings
warnings.filterwarnings(action='ignore')

# Train both heads jointly; targets are keyed by the model's output layer names
history = model.fit(
    x=X_train,
    y={'class_output': y_train, 'bbox_output': bboxes_train},
    validation_data=(X_val, {'class_output': y_val, 'bbox_output': bboxes_val}),
    epochs=20,
    batch_size=32,
)

Epoch 1/20
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 127ms/step - bbox_output_loss: 0.0331 - bbox_output_mse: 0.0331 - class_output_accuracy: 0.8137 - class_output_loss: 0.5638 - loss: 0.5969 - val_bbox_output_loss: 0.0247 - val_bbox_output_mse: 0.0242 - val_class_output_accuracy: 0.7917 - val_class_output_loss: 0.3997 - val_loss: 0.4403
Epoch 2/20
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 69ms/step - bbox_output_loss: 0.0246 - bbox_output_mse: 0.0246 - class_output_accuracy: 0.8626 - class_output_loss: 0.3628 - loss: 0.3874 - val_bbox_output_loss: 0.0226 - val_bbox_output_mse: 0.0220 - val_class_output_accuracy: 0.9583 - val_class_output_loss: 0.2336 - val_loss: 0.2841
Epoch 3/20
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 56ms/step - bbox_output_loss: 0.0204 - bbox_output_mse: 0.0204 - class_output_accuracy: 0.9434 - class_output_loss: 0.1778 - loss: 0.1983 - val_bbox_output_loss: 0.0227 - val_bbox_output_mse: 0.0223 - val_cla

In [13]:
def visualize_detection(model, image_path, class_names, confidence_thresh=0.5):
    """Run the detector on one image and draw its prediction on a copy.

    Args:
        model: Model whose ``predict`` returns ``(class_probs, bbox)`` for a
            batch of 224x224 images scaled to [0, 1]. ``bbox`` holds
            ``[xmin, ymin, xmax, ymax]`` as fractions of the image size.
        image_path: Path of the image file to read with ``cv2.imread``.
        class_names: Sequence mapping a class index to its display name.
        confidence_thresh: The box and label are drawn only when the top
            class probability is strictly greater than this value.

    Returns:
        A copy of the original-resolution image, annotated when the
        prediction clears the threshold and unmodified otherwise. Returns
        ``None`` (after printing an error) if the image cannot be loaded.
    """
    # Load and preprocess image
    image = cv2.imread(image_path)
    if image is None:
        print(f"Error loading {image_path}")
        return None

    orig_image = image.copy()
    h, w = image.shape[:2]
    image = cv2.resize(image, (224, 224))
    input_img = np.expand_dims(image.astype('float32') / 255.0, axis=0)

    # Predict
    class_probs, bbox = model.predict(input_img)
    class_id = int(np.argmax(class_probs))
    confidence = float(np.max(class_probs))

    if confidence > confidence_thresh:
        # Scale bbox back to original image size
        x1, y1, x2, y2 = bbox[0]
        # The regression head does not guarantee min <= max, so order each
        # pair before using (xmin, ymin) as the label anchor.
        xs = sorted((int(x1 * w), int(x2 * w)))
        ys = sorted((int(y1 * h), int(y2 * h)))
        # Keep the corners on valid pixel indices of the original image
        xmin, xmax = (min(max(v, 0), w - 1) for v in xs)
        ymin, ymax = (min(max(v, 0), h - 1) for v in ys)

        # Draw bounding box and label
        cv2.rectangle(orig_image, (xmin, ymin), (xmax, ymax), (0, 255, 0), 2)
        label = f"{class_names[class_id]}: {confidence:.2f}"
        # putText anchors the text at its bottom-left corner. When the box
        # touches the top edge there is no room above it, so place the label
        # just inside the box instead of drawing it off-image.
        label_y = ymin - 10 if ymin - 10 >= 15 else min(ymin + 20, h - 1)
        cv2.putText(orig_image, label, (xmin, label_y),
                   cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)

    return orig_image

# Test on a sample image
# The test directory also holds the .xml annotations, and os.listdir returns
# entries in arbitrary order, so pick the first .jpg explicitly instead of
# whatever entry happens to come first.
test_image_files = sorted(
    f for f in os.listdir(test_img_dir) if f.lower().endswith('.jpg'))
if not test_image_files:
    print(f"No .jpg images found in {test_img_dir}")
else:
    test_img_path = os.path.join(test_img_dir, test_image_files[0])
    result = visualize_detection(model, test_img_path, classes)
    if result is not None:
        cv2.imwrite('detection_result.jpg', result)
        print("Detection result saved as 'detection_result.jpg'")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step
Detection result saved as 'detection_result.jpg'
