In [1]:
import pandas as pd
import kaggle as kg
import os
import cv2
import pathlib
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import json
import ast
from keras.models import Model
from keras.applications import VGG16
from keras.optimizers import SGD
from keras.layers import Conv2D
from xml.etree import ElementTree as ET

2024-09-17 22:29:36.909937: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-09-17 22:29:36.998873: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-09-17 22:29:37.622059: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-09-17 22:29:37.625335: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
def anchor_boxes(image_size,grids_size,aspect_ratios):

    image_width, image_height, _ = image_size

    grid_width = image_width//grids_size[0]
    grid_height = image_height//grids_size[1]

    grid_center_x_start = grid_width//2
    grid_center_x_end = int((grids_size[0] - 0.5)*grid_width) 

    grid_center_x = np.linspace(grid_center_x_start,grid_center_x_end,grids_size[0])

    grid_center_y_start = grid_height//2
    grid_center_y_end = int((grids_size[1] - 0.5)*grid_height)

    grid_center_y = np.linspace(grid_center_y_start,grid_center_y_end,grids_size[1])

    grid_center_x_mesh, grid_center_y_mesh = np.meshgrid(grid_center_x,grid_center_y)

    grid_center_x_mesh = np.expand_dims(grid_center_x_mesh,-1)
    grid_center_y_mesh = np.expand_dims(grid_center_y_mesh,-1)

    anchor_boxes_no = len(aspect_ratios)

    anchor_boxes_tensor = np.zeros((grids_size[0],grids_size[1],anchor_boxes_no,4))

    anchor_boxes_tensor[...,0] = np.tile(grid_center_x_mesh,(1,1,anchor_boxes_no))
    anchor_boxes_tensor[...,1] = np.tile(grid_center_y_mesh,(1,1,anchor_boxes_no))

    anchor_box_width_height = list()

    for aspect_ratio in aspect_ratios:

        anchor_box_width_height.append((grid_width*np.sqrt(aspect_ratio),
                                        grid_height/np.sqrt(aspect_ratio)))
        
    anchor_box_width_height = np.array(anchor_box_width_height)

    anchor_boxes_tensor[...,2] = anchor_box_width_height[:,0]
    anchor_boxes_tensor[...,3] = anchor_box_width_height[:,1]

    return anchor_boxes_tensor

In [3]:
def centroid2minmax(anchor_boxes_centroid_tensor):

    anchor_boxes_minmax_tensor = np.copy(anchor_boxes_centroid_tensor)

    anchor_boxes_minmax_tensor[...,0] = anchor_boxes_minmax_tensor[...,0] - (anchor_boxes_minmax_tensor[...,2]//2)
    anchor_boxes_minmax_tensor[...,1] = anchor_boxes_minmax_tensor[...,1] - (anchor_boxes_minmax_tensor[...,3]//2)
    anchor_boxes_minmax_tensor[...,2] = anchor_boxes_minmax_tensor[...,0] + (anchor_boxes_minmax_tensor[...,2]//2)
    anchor_boxes_minmax_tensor[...,3] = anchor_boxes_minmax_tensor[...,1] + (anchor_boxes_minmax_tensor[...,3]//2)

    return anchor_boxes_minmax_tensor

In [67]:
def compute_IoU(anchor_boxes_minmax_tensor,image_gt_bbox_coords):

    image_gt_bbox_centroid_coords = np.array(image_gt_bbox_coords)
    image_gt_bbox_centroid_coords[:,0] = image_gt_bbox_centroid_coords[:,0] +\
                                         (image_gt_bbox_centroid_coords[:,2] - image_gt_bbox_centroid_coords[:,0])//2
    image_gt_bbox_centroid_coords[:,1] = image_gt_bbox_centroid_coords[:,1] +\
                                         (image_gt_bbox_centroid_coords[:,3] - image_gt_bbox_centroid_coords[:,1])//2
    image_gt_bbox_centroid_coords[:,2] = (image_gt_bbox_centroid_coords[:,2] - image_gt_bbox_centroid_coords[:,0])
    image_gt_bbox_centroid_coords[:,3] = (image_gt_bbox_centroid_coords[:,3] - image_gt_bbox_centroid_coords[:,1]) 
    
    IoU_tensor = np.zeros((len(image_gt_bbox_coords),anchor_boxes_minmax_tensor.shape[0],anchor_boxes_minmax_tensor.shape[1],
                    anchor_boxes_minmax_tensor.shape[2]))
    bbox_present_idxes = [[]]*len(image_gt_bbox_coords) 
    IoU_thresh = 0.25

    for i in range(len(image_gt_bbox_coords)):

        for j in range(anchor_boxes_minmax_tensor.shape[2]):
            """
            centroid_x_condition_anchor_boxes = ((image_gt_bbox_centroid_coords[i,0] > anchor_boxes_minmax_tensor[:,:,j,0]) & 
                                               (image_gt_bbox_centroid_coords[i,0] < anchor_boxes_minmax_tensor[:,:,j,2]))
            centroid_y_condition_anchor_boxes = ((image_gt_bbox_centroid_coords[i,1] > anchor_boxes_minmax_tensor[:,:,j,1]) & 
                                               (image_gt_bbox_centroid_coords[i,1] < anchor_boxes_minmax_tensor[:,:,j,3]))
            grid_cells_idxes = np.argwhere(centroid_x_condition_anchor_boxes & centroid_y_condition_anchor_boxes)
            bbox_present_idxes[i].append(grid_cells_idxes)
            """

            xmin_intersection = np.maximum(image_gt_bbox_coords[i][0],anchor_boxes_minmax_tensor[:,:,j,0])
            ymin_intersection = np.maximum(image_gt_bbox_coords[i][1],anchor_boxes_minmax_tensor[:,:,j,1])

            xmax_intersection = np.minimum(image_gt_bbox_coords[i][2],anchor_boxes_minmax_tensor[:,:,j,2])
            ymax_intersection = np.minimum(image_gt_bbox_coords[i][3],anchor_boxes_minmax_tensor[:,:,j,3])

            intersection_width = np.maximum(0,(xmax_intersection - xmin_intersection))
            intersection_height = np.maximum(0,(ymax_intersection - ymin_intersection))

            intersection_area = intersection_width * intersection_height

            image_gt_bbox_area = image_gt_bbox_centroid_coords[i,2] * image_gt_bbox_centroid_coords[i,3]
            anchor_boxes_width = (anchor_boxes_minmax_tensor[:,:,j,2] - anchor_boxes_minmax_tensor[:,:,j,0])
            anchor_boxes_height = (anchor_boxes_minmax_tensor[:,:,j,3] - anchor_boxes_minmax_tensor[:,:,j,1])

            union_area = ((anchor_boxes_width * anchor_boxes_height) + image_gt_bbox_area) - intersection_area

            IoU_tensor[i,:,:,j] = intersection_area/union_area
            bbox_present_idxes[i].append(np.argwhere(IoU_tensor[i,:,:,j] > 0))

    IoU_tensor_reduced = np.max(IoU_tensor,axis=0)
    anchor_boxes_gt_mask = np.float64(IoU_tensor_reduced > IoU_thresh)

    return image_gt_bbox_centroid_coords, anchor_boxes_gt_mask, bbox_present_idxes, IoU_tensor_reduced

In [5]:
def normalize_bbox_coords(image_size,amchor_boxes_gt_mask,bbox_present_idxes,image_gt_bbox_centroid_coords,anchor_boxes_minmax_tensor):

    image_width, image_height, _ = image_size
    normalized_image_gt_bbox_coords = np.zeros_like(anchor_boxes_minmax_tensor)

    for i in range(len(image_gt_bbox_centroid_coords)):
    
        for j in range(anchor_boxes_minmax_tensor.shape[2]):

            idx = bbox_present_idxes[i][j]

            normalized_image_gt_bbox_coords[idx[:,0],idx[:,1],j,0] = image_gt_bbox_centroid_coords[i][0]/anchor_boxes_minmax_tensor[idx[:,0],idx[:,1],j,2]
            normalized_image_gt_bbox_coords[idx[:,0],idx[:,1],j,1] = image_gt_bbox_centroid_coords[i][1]/anchor_boxes_minmax_tensor[idx[:,0],idx[:,1],j,3]
            normalized_image_gt_bbox_coords[idx[:,0],idx[:,1],j,2] = image_gt_bbox_centroid_coords[i][2]/image_width
            normalized_image_gt_bbox_coords[idx[:,0],idx[:,1],j,3] = image_gt_bbox_centroid_coords[i][3]/image_height

    return normalized_image_gt_bbox_coords

In [6]:
def create_gt_labels_tensor(normalized_image_gt_bbox_coords, IoU_tensor, bbox_present_idxes, image_cls_labels, num_classes):

    cls_probabilities_tensor = np.zeros((normalized_image_gt_bbox_coords.shape[0],normalized_image_gt_bbox_coords.shape[1],num_classes))

    for i in range(len(bbox_present_idxes)):
        idx_0 = bbox_present_idxes[i][0]
        idx_1 = bbox_present_idxes[i][1]
        cls_probabilities_tensor[idx_0[:,0],idx_0[:,1],:] = np.eye(num_classes,num_classes)[image_cls_labels[i]]
        cls_probabilities_tensor[idx_1[:,0],idx_1[:,1],:] = np.eye(num_classes,num_classes)[image_cls_labels[i]]

    gt_labels_tensor = np.copy(normalized_image_gt_bbox_coords)
    confidence_scores = np.expand_dims(IoU_tensor,-1)
    gt_labels_tensor = np.concatenate((gt_labels_tensor,confidence_scores),axis=3)
    gt_labels_tensor = gt_labels_tensor.reshape(gt_labels_tensor.shape[0],gt_labels_tensor.shape[1],gt_labels_tensor.shape[2]*gt_labels_tensor.shape[3])
    gt_labels_tensor = np.concatenate((gt_labels_tensor,cls_probabilities_tensor),axis=2)
    
    return gt_labels_tensor

In [7]:
def multiclass_cnn():

    vgg16 = VGG16(include_top=False,input_shape=(640,480,3),weights="imagenet",pooling=None)
    vgg16.trainable = False
    input_to_vgg16 = vgg16.input
    vgg16_output = Conv2D(filters=90,kernel_size=(14,9))(vgg16.layers[-1].output)

    return Model(inputs=[input_to_vgg16],outputs=[vgg16_output])

In [8]:
model = multiclass_cnn()
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 640, 480, 3)]     0         
                                                                 
 block1_conv1 (Conv2D)       (None, 640, 480, 64)      1792      
                                                                 
 block1_conv2 (Conv2D)       (None, 640, 480, 64)      36928     
                                                                 
 block1_pool (MaxPooling2D)  (None, 320, 240, 64)      0         
                                                                 
 block2_conv1 (Conv2D)       (None, 320, 240, 128)     73856     
                                                                 
 block2_conv2 (Conv2D)       (None, 320, 240, 128)     147584    
                                                                 
 block2_pool (MaxPooling2D)  (None, 160, 120, 128)     0     

In [9]:
def train_test_df(imgs_base_path,annotations_base_path):

    img_complete_paths = list()
    img_class_labels = list()
    img_gt_bbox_coords = list()

    for single_img_complete_path in pathlib.Path(imgs_base_path).glob("*"):

        img_path = str(single_img_complete_path)
        img_label_path = os.path.join(annotations_base_path,str(single_img_complete_path).split("/")[-1].split(".")[0]+".xml")

        class_gt_labels_list = list()
        gt_bbox_coords_list = list()

        tree = ET.parse(img_label_path)
        root = tree.getroot()

        for member in root.findall("object"):
            """
            for child in member:

                if child.tag == "name":
                    class_gt_labels_list.append(child.text)

                if child.tag == "bndbox":
                    xmin = float(child[0].text)
                    ymin = float(child[1].text)
                    xmax = float(child[2].text)
                    ymax = float(child[3].text)
            """
            class_gt_labels_list.append(member.find("name").text)
            xmin = float(member.find("bndbox/xmin").text)
            ymin = float(member.find("bndbox/ymin").text)
            xmax = float(member.find("bndbox/xmax").text)
            ymax = float(member.find("bndbox/ymax").text)
            
            bbox_width = xmax - xmin
            bbox_height = ymax - ymin
            

            gt_bbox_coords_list.append([xmin,ymin,bbox_width,bbox_height])

        img_complete_paths.append(str(single_img_complete_path))
        img_class_labels.append(class_gt_labels_list)
        img_gt_bbox_coords.append(gt_bbox_coords_list)

    return pd.DataFrame(data={"img_path":img_complete_paths,
                              "img_gt_class_labels":img_class_labels,
                              "img_gt_bbox_coords":img_gt_bbox_coords})

In [10]:
data_df = train_test_df("/home/thasin/class-projects/annotate/dataset/VOC2012_train_val/VOC2012_train_val/JPEGImages","/home/thasin/class-projects/annotate/dataset/VOC2012_train_val/VOC2012_train_val/Annotations")

In [11]:
data_df.shape

(17125, 3)

In [27]:
unique_labels = set()

for img_labels in data_df.iloc[:,1]:
    unique_labels = unique_labels.union(set(img_labels))

unique_labels = list(unique_labels)
#unique_labels.insert(0,"background")

labels2idx = dict(zip(unique_labels,range(len(unique_labels))))

In [28]:
def labels2idx_mapping(img_labels):
    
     return list(map(lambda x: labels2idx[x],img_labels))

In [29]:
data_df.iloc[:,1] = data_df.iloc[:,1].apply(labels2idx_mapping)


In [26]:
data_df["img_gt_class_labels"] = data_df["img_gt_class_labels"].apply(json.dumps)
data_df["img_gt_bbox_coords"] = data_df["img_gt_bbox_coords"].apply(json.dumps)

In [32]:
training_data = data_df.iloc[0:15000,:]
cv_data = data_df.iloc[15000:,]

training_data.to_csv("./training_data.csv",index=False)
cv_data.to_csv("./cv_data.csv",index=False)

In [33]:
training_data = pd.read_csv("training_data.csv")
training_data["img_gt_class_labels"] = training_data["img_gt_class_labels"].apply(ast.literal_eval)
training_data["img_gt_bbox_coords"] = training_data["img_gt_bbox_coords"].apply(ast.literal_eval)

In [34]:
plt.imread(training_data.iloc[0,0]).shape


(375, 500, 3)

In [35]:
anchor_boxes_tensor = anchor_boxes((640,480,3),(7,7),(1/2,2))
anchor_boxes_minmax_tensor = centroid2minmax(anchor_boxes_tensor)
"""
gt_bboxes_mask,iou_tensor = compute_IoU(anchor_boxes_tensor,training_data.iloc[0,2])
#normalized_gt_bbox_coords = normalize_bbox_coords((640,480,3),training_data.iloc[0,2],gt_bboxes_mask,anchor_boxes_tensor)
#gt_labels_tensor = create_gt_labels_tensor(normalized_gt_bbox_coords,iou_tensor,gt_bboxes_mask,training_data.iloc[0,1],
                                           20)"""

'\ngt_bboxes_mask,iou_tensor = compute_IoU(anchor_boxes_tensor,training_data.iloc[0,2])\n#normalized_gt_bbox_coords = normalize_bbox_coords((640,480,3),training_data.iloc[0,2],gt_bboxes_mask,anchor_boxes_tensor)\n#gt_labels_tensor = create_gt_labels_tensor(normalized_gt_bbox_coords,iou_tensor,gt_bboxes_mask,training_data.iloc[0,1],\n                                           20)'

In [36]:
def custom_data_generator(df,mb_size):

    for i in range(df.shape[0]//mb_size):

        X_train_mb = list()
        Y_train_mb = list()
        GT_mask_train_mb = list()

        for j in range(0,mb_size):

            df_mb = df.iloc[(i*mb_size)+j]
            img_path = df_mb["img_path"]

            X_train_mb.append(cv2.resize(plt.imread(img_path),(640,480)))

            gt_bboxes_mask, iou_tensor = compute_IoU(anchor_boxes_tensor,df_mb["img_gt_bbox_coords"])
            normalized_img_gt_bbox_coords = normalize_bbox_coords((640,480,3),df_mb["img_gt_bbox_coords"],
                                                                  gt_bboxes_mask,anchor_boxes_tensor)
            Y_train, final_gt_bboxes_mask = create_gt_labels_tensor(normalized_img_gt_bbox_coords,iou_tensor,
                                                                    gt_bboxes_mask,df_mb["img_gt_class_labels"],20)
            
            Y_train_mb.append(Y_train)
            GT_mask_train_mb.append(final_gt_bboxes_mask)
            
        yield np.array(X_train_mb), np.array(Y_train_mb), np.array(GT_mask_train_mb)

In [37]:
training_data_generator = custom_data_generator(training_data,5)


In [38]:
def custom_loss_fn(Y_true_mb,Y_pred_mb,GT_mask_train_mb,lambda_coord,lambda_noobj):

    squared_error = (Y_true_mb - Y_pred_mb)**2

    """
    squared_error_with_mask = GT_mask_train_mb * squared_error
    squared_error_with_neg_mask = (1.0 - GT_mask_train_mb) * squared_error
    """

    cx_cy_squared_error_tensor = np.concatenate((GT_mask_train_mb*squared_error[:,:,:,0:2],
                                                 GT_mask_train_mb*squared_error[:,:,:,5:7]),axis=0)
    
    sqrt_squared_error = (np.sqrt(Y_true_mb) - np.sqrt(Y_pred_mb))**2

    #sqrt_squared_error_with_mask = GT_mask_train_mb * sqrt_squared_error
    
    wh_sqrt_squared_error_tensor = np.concatenate((GT_mask_train_mb*sqrt_squared_error[:,:,:,2:4],
                                                   GT_mask_train_mb*sqrt_squared_error[:,:,:,7:9]),axis=0)
    
    loss_fn_first_term = lambda_coord*np.sum(cx_cy_squared_error_tensor)
    loss_fn_second_term = lambda_coord*np.sum(wh_sqrt_squared_error_tensor)

    confidence_score_error_tensor = GT_mask_train_mb*np.concatenate((squared_error[:,:,:,4],
                                                                     squared_error[:,:,:,9]),axis=0)
    
    loss_fn_third_term = np.sum(confidence_score_error_tensor)

    confidence_score_noobj_error_tensor = (1.0 - GT_mask_train_mb)*np.concatenate((squared_error[:,:,:,4],
                                                                                   squared_error[:,:,:,9]),axis=0)

    loss_fn_forth_term = lambda_noobj*np.sum(confidence_score_noobj_error_tensor)

    loss_fn_fifth_term = GT_mask_train_mb*np.sum(squared_error[:,:,:,10:])

    overall_loss_fn = loss_fn_first_term + loss_fn_second_term + loss_fn_third_term +\
                        loss_fn_forth_term + loss_fn_fifth_term
    
    return overall_loss_fn

In [43]:
optimizer = SGD()

In [76]:
@tf.function
def training_step(X_train_mb,Y_true_train_mb,GT_mask_train_mb):

    with tf.GradientTape() as tape:
            
        Y_pred_train_mb = model(X_train_mb, training=True)
        training_loss = custom_loss_fn(Y_true_train_mb, Y_pred_train_mb,GT_mask_train_mb,5,0.5)

    grads = tape.gradient(training_loss, model.trainable_weights)
    optimizer.apply_gradients(zip(grads, model.trainable_weights))

    train_acc_metric.update_state(Y_true_train_mb,Y_pred_train_mb)

    return training_loss

In [79]:
@tf.function
def testing_forward_pass(X_test_mb,Y_true_test_mb,GT_mask_test_mb):

    Y_pred_test_mb = model(X_test_mb,training=False)
    testing_loss = custom_loss_fn(Y_true_test_mb,Y_pred_test_mb,GT_mask_test_mb)
    test_acc_metric.update_state(Y_true_test_mb,Y_pred_test_mb)

    return testing_loss

In [80]:
epochs = 20
training_data_mb_size = 5
testing_data_mb_size = 5

In [81]:
train_acc_metric = tf.keras.metrics.CategoricalAccuracy()
test_acc_metric = tf.keras.metrics.CategoricalAccuracy()

for epoch in range(epochs):

    training_data_generator = custom_data_generator(training_data,5)

    for time_step, (X_train_mb, cv_data,GT_mask_train_mb)in enumerate(training_data_generator):
        training_loss = training_step(X_train_mb,cv_data,GT_mask_train_mb)

        if (time_step+1) % 10 == 0:
            print("Epoch %d, Time Step %d, Training loss for one mini batch: %.4f"
            % (epoch+1, time_step+1, float(training_loss)))
            
    training_acc = train_acc_metric.result()    
    print("Epoch %d, Training Accuracy: %.2f" % (epoch+1,float(training_acc)))
    train_acc_metric.reset_states()

    testing_data_generator = custom_data_generator(cv_data,5)

    for X_test_mb,GT_mask_test_mb in testing_data_generator:
        testing_loss = testing_forward_pass(X_test_mb,cv_data,GT_mask_test_mb)

    print("\nEpoch %d, Testing Loss for last mini batch: %.4f" % (epoch+1,float(testing_loss)))
    testing_acc = test_acc_metric.result()
    print("Epoch %d, Testing Accuracy: %.2f" % (epoch+1,float(testing_acc)))
    test_acc_metric.reset_states()

    print("\n\n")

TypeError: len() of unsized object