In [4]:
import tensorflow as tf
import numpy as np
import xml.etree.ElementTree as ET
import os

In [5]:
def parse_voc_annotations(annotation_dir, image_dir, class_map):
    image_paths = []
    bbox_data = []
    class_labels =[]
    for xml_file in sorted(os.listdir(os.path.join(annotation_dir))):
        if not xml_file.endswith(".xml"):
            continue
        tree = ET.parse(os.path.join(annotation_dir, xml_file))
        root = tree.getroot()
        image_file_name = root.find("filename").text
        path = os.path.join(image_dir,image_file_name)

        size = root.find("size")
        img_width = int(size.find("width").text)
        img_height = int(size.find("height").text)

        obj = root.find("object")
        if obj is not None:
            class_name = obj.find("name").text
            if class_name not in class_map:
                continue
            class_id = class_map[class_name]

            bndbox = obj.find("bndbox")
            xmin = float(bndbox.find("xmin").text) / img_width
            ymin = float(bndbox.find("ymin").text) / img_height
            xmax = float(bndbox.find("xmax").text) / img_width
            ymax = float(bndbox.find("ymax").text) / img_height
            image_paths.append(path)
            bbox_data.append([xmin, ymin, xmax, ymax])
            class_labels.append(class_id)
    return image_paths, bbox_data, class_labels



In [6]:
image_directory = "../data/images"
annotations_directory = "../data/Annotations"
class_map ={"thank you":0,"open palm":1,"first":2,"okay":3,"call":4}
image_paths, bounding_box,class_labels= parse_voc_annotations(annotations_directory,image_directory,class_map)

In [7]:
image_paths = tf.constant(image_paths)
bbox_data = tf.constant(bounding_box, dtype=tf.float32)
class_labels = tf.constant(class_labels, dtype=tf.int32)

def load_and_preprocess_image(path, bbox, label):
    image = tf.io.read_file(path)
    image = tf.image.decode_image(image, channels=3)
    image.set_shape([None, None, 3])
    image = tf.image.resize(image, [128, 128])
    image = image / 255.0
    label_one_hot = tf.one_hot(label, depth=len(class_map))
    return image, {"gesture": label_one_hot, "bbox": bbox}


dataset = tf.data.Dataset.from_tensor_slices((image_paths, bbox_data, class_labels))
dataset = dataset.map(load_and_preprocess_image, num_parallel_calls=tf.data.AUTOTUNE)


dataset = dataset.shuffle(buffer_size=56).batch(32).prefetch(tf.data.AUTOTUNE)

DATASET_SIZE = len(image_paths)
train_size = int(0.8 * DATASET_SIZE)

train_ds = dataset.take(train_size)
val_ds = dataset.skip(train_size)

In [89]:
DATASET_SIZE

37

In [87]:

train_ds

<_TakeDataset element_spec=(TensorSpec(shape=(None, 128, 128, 3), dtype=tf.float32, name=None), {'gesture': TensorSpec(shape=(None, 5), dtype=tf.float32, name=None), 'bbox': TensorSpec(shape=(None, 4), dtype=tf.float32, name=None)})>

In [1]:
from tensorflow.keras.applications import MobileNetV2
from tensorflow.keras.layers import Input, Dense, GlobalAveragePooling2D,BatchNormalization,Dropout
from tensorflow.keras.models import Model

In [81]:
base_model = MobileNetV2(input_shape=(128,128,3), include_top=False, weights="imagenet")
base_model.trainable =False

In [None]:
inputs = Input(shape=(128,128,3))
x = base_model(inputs, training=False)
x = GlobalAveragePooling2D()(x)
x = BatchNormalization()(x)
x = Dropout(0.5)(x)

bbox = Dense(128, activation='relu', kernel_regularizer=tf.keras.regularizers.L2(1e-4))(x)
bbox = Dense(64, activation='relu', kernel_regularizer=tf.keras.regularizers.L2(1e-4))(bbox)
bbox = Dense(4, name='bbox', activation='sigmoid')(bbox)  

cls = Dense(64, activation='relu', kernel_regularizer=tf.keras.regularizers.L2(1e-4))(x)
cls = Dense(len(class_map), name='gesture', activation='softmax')(cls)

model = Model(inputs=inputs, outputs=[bbox, cls])

model.compile(
    optimizer=tf.keras.optimizers.Adam(1e-3),
    loss={
      'bbox': tf.keras.losses.mae,         
      'gesture': 'categorical_crossentropy'
    },
    metrics={'gesture': 'accuracy'}
)

In [83]:
model.fit(
    train_ds,
    validation_data= val_ds,
    epochs= 25,
    callbacks =[
            tf.keras.callbacks.EarlyStopping(monitor="gesture_loss", restore_best_weights=True, mode="min", patience=5),
            tf.keras.callbacks.ModelCheckpoint("../trained_model/best_sign_language_model.keras", save_best_only=True,monitor="loss", mode="min", verbose=1)
    ],
    verbose=1
)


Epoch 1/25
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 59ms/step - bbox_loss: 0.3915 - gesture_accuracy: 0.1592 - gesture_loss: 3.1247 - loss: 3.7943
Epoch 1: loss improved from inf to 3.70452, saving model to ../trained_model/best_sign_language_model.keras




[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 4s/step - bbox_loss: 0.3933 - gesture_accuracy: 0.1602 - gesture_loss: 3.0122 - loss: 3.7644 
Epoch 2/25
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 60ms/step - bbox_loss: 0.3153 - gesture_accuracy: 0.5203 - gesture_loss: 1.2227 - loss: 1.6639 
Epoch 2: loss improved from 3.70452 to 1.63041, saving model to ../trained_model/best_sign_language_model.keras
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1s/step - bbox_loss: 0.3152 - gesture_accuracy: 0.5270 - gesture_loss: 1.1815 - loss: 1.6527  
Epoch 3/25
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 62ms/step - bbox_loss: 0.2450 - gesture_accuracy: 0.7981 - gesture_loss: 0.8183 - loss: 0.8667 
Epoch 3: loss improved from 1.63041 to 0.95270, saving model to ../trained_model/best_sign_language_model.keras
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1s/step - bbox_loss: 0.2416 - gesture_accuracy: 0.7934 - gestu

<keras.src.callbacks.history.History at 0x249cebcb5c0>

In [84]:

# for layer in base_model.layers[-20:]:
#     layer.trainable = True

# model.compile(
#     optimizer=tf.keras.optimizers.Adam(1e-5),  
#     loss=model.loss,
#     metrics=[[],['accuracy']]
# )

# model.fit(
#     train_ds,
#     validation_data= val_ds,
#     epochs= 25,
#     callbacks =[
#             tf.keras.callbacks.EarlyStopping(monitor="gesture_accuracy", restore_best_weights=True, mode="max", patience=5),
#             tf.keras.callbacks.ModelCheckpoint("../trained_model/best_sign_language_model.keras", save_best_only=True,monitor="loss", mode="min", verbose=1)
#     ],
#     verbose=1
# )
