# Intro to Object Detection Colab

Welcome to the object detection colab!  This demo will take you through the steps of running an "out-of-the-box" detection model on a collection of images.

## Imports and Setup

In [1]:
import os
import pathlib
import matplotlib
import matplotlib.pyplot as plt

import io
import scipy.misc
import numpy as np
from six import BytesIO
from PIL import Image, ImageDraw, ImageFont

import tensorflow as tf
import pandas as pd
import time
from tqdm.notebook import tqdm

from object_detection.utils import label_map_util
from object_detection.utils import config_util
from object_detection.utils import visualization_utils as viz_utils
from object_detection.builders import model_builder

%matplotlib inline

2024-12-11 15:44:39.656503: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-12-11 15:44:39.682874: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 AVX512F AVX512_VNNI AVX512_BF16 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Utilities

In [2]:
def load_image_into_numpy_array(path):
  """Load an image from file into a numpy array.

  Puts image into numpy array to feed into tensorflow graph.
  Note that by convention we put it into a numpy array with shape
  (height, width, channels), where channels=3 for RGB.

  Images stored in another mode (grayscale, palette, RGBA, ...) are
  converted to RGB first, so the result always has exactly three channels
  instead of failing on the reshape.

  Args:
    path: the file path to the image

  Returns:
    uint8 numpy array with shape (img_height, img_width, 3)
  """
  # The context manager closes the file handle even if reading fails.
  with tf.io.gfile.GFile(path, 'rb') as image_file:
    img_data = image_file.read()
  # Decode from memory and normalise the mode so three channels are guaranteed.
  image = Image.open(BytesIO(img_data)).convert('RGB')
  # Converting the PIL image directly yields (height, width, 3) without
  # building an intermediate Python list of per-pixel tuples.
  return np.array(image, dtype=np.uint8)

def get_keypoint_tuples(eval_config):
  """Collect the keypoint edges of an eval config as (start, end) pairs.

  Args:
    eval_config: an eval config exposing a `keypoint_edge` iterable whose
      items carry `start` and `end` attributes

  Returns:
    a list with one (start, end) tuple per edge, in the order they appear
    in the config
  """
  return [(kp_edge.start, kp_edge.end) for kp_edge in eval_config.keypoint_edge]

### Build a detection model and load pre-trained model weights

This sometimes takes a little while; please be patient!

In [3]:
# Absolute, machine-specific locations of the pipeline config file and of the
# directory holding the trained checkpoints; adjust both when running elsewhere.
pipeline_config = "/home/ttran/projects/TFmodels2/models/research/object_detection/training/ssd_resnet152_v1_fpn_640x640_coco17_tpu-8.config"
model_dir = "/home/ttran/projects/TFmodels2/models/research/object_detection/ssd_resnet152_v1_fpn_640x640_coco17_tpu-8_3"

# Load pipeline config and build a detection model.
# `configs` is reused by a later cell to locate the label map; the model is
# built with is_training=False because it is only used for inference here.
configs = config_util.get_configs_from_pipeline_file(pipeline_config)
model_config = configs['model']
detection_model = model_builder.build(
      model_config=model_config, is_training=False)

# Restore checkpoint.
# The checkpoint name 'ckpt-26' is hard-coded and resolved inside model_dir.
# NOTE(review): expect_partial() is presumably chained to silence warnings
# about checkpoint values this inference-only object does not use - confirm.
ckpt = tf.compat.v2.train.Checkpoint(
      model=detection_model)
ckpt.restore(os.path.join(model_dir, 'ckpt-26')).expect_partial()

def get_model_detection_function(model):
  """Build a tf.function that runs the detection pipeline of `model`.

  Args:
    model: object providing `preprocess`, `predict` and `postprocess`.

  Returns:
    A tf.function that takes an image tensor and returns the tuple
    (detections, prediction_dict, shapes reshaped to 1-D).
  """

  @tf.function
  def detect_fn(image):
    """Run preprocess -> predict -> postprocess on `image`."""
    preprocessed, true_shapes = model.preprocess(image)
    raw_predictions = model.predict(preprocessed, true_shapes)
    postprocessed = model.postprocess(raw_predictions, true_shapes)
    flat_shapes = tf.reshape(true_shapes, [-1])
    return postprocessed, raw_predictions, flat_shapes

  return detect_fn

# Build the detection tf.function once for the restored model; the inference
# cell further down calls detect_fn(input_tensor) for every image.
detect_fn = get_model_detection_function(detection_model)

2024-12-11 15:44:40.948898: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-12-11 15:44:40.950376: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-12-11 15:44:40.972154: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysf

# Load label map data (for plotting).

Label maps map index numbers to category names, so that when our convolutional network predicts `5`, we know that this corresponds to `airplane`.  Here we use internal utility functions, but anything that returns a dictionary mapping integers to appropriate string labels would be fine.

In [4]:
# The label map location comes from the eval input section of the pipeline
# config loaded earlier in the notebook.
label_map_path = configs['eval_input_config'].label_map_path
label_map = label_map_util.load_labelmap(label_map_path)
# max_num_classes is set to the largest index found in the label map, and
# display names are requested for the categories.
categories = label_map_util.convert_label_map_to_categories(
    label_map,
    max_num_classes=label_map_util.get_max_label_map_index(label_map),
    use_display_name=True)
# category_index is indexed by class id in the inference cell to read ['name'].
category_index = label_map_util.create_category_index(categories)
# Label-map dictionary built with display names; it is not referenced by the
# other cells shown in this notebook.
label_map_dict = label_map_util.get_label_map_dict(label_map, use_display_name=True)

### Putting everything together!

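Run the code below, which loads each image in `image_dir`, runs it through the detection model, and records the predicted classes and scores for every image. Note that this version of the notebook saves the results to a Parquet file rather than visualizing the detections and keypoints.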

Note that this will take a long time (several minutes) the first time you run this code due to tf.function's trace-compilation --- on subsequent runs (e.g. on new images), things will be faster.

Here are some simple things to try out if you are curious:
* Try running inference on your own images (local paths work)
* Modify some of the input images and see if detection still works.  Some simple things to try out here (just uncomment the relevant portions of code) include flipping the image horizontally, or converting to grayscale (note that we still expect the input image to have 3 channels).
* Print out `detections['detection_boxes']` and try to match the box locations to the boxes in the image.  Notice that coordinates are given in normalized form (i.e., in the interval [0, 1]).
* Set min_score_thresh to other values (between 0 and 1) to allow more detections in or to filter out more detections.

Note that you can run this cell repeatedly without rerunning earlier cells.


In [None]:
# Directory whose entries are fed to the detector (absolute, machine-specific
# path); the commented lines below are alternative inputs kept for reference.
image_dir = "/mnt/sda1/Backup/heif_lite"
#image_dir = "/home/ttran/projects/TFmodels2/models/research/object_detection/test_images"
#image_path = os.path.join(image_dir, '0a00b11de9ad098befcd6543625b311a9a90ca80_lite.jpg')

# Holds the per-file result dicts of the current run; reset whenever this cell runs.
result = []
# Added to the raw detection class values before the category_index lookup.
label_id_offset = 1
# Run identifier: stored in every result row and embedded in the output file name.
timestamp = time.time()

def run_inference(directory):
    """Run the detector on every file in `directory` and collect the results.

    For each file the image is loaded, passed through `detect_fn`, and one
    dict is produced with the keys 'name', 'timestamp', 'inferred' and
    'time(ms)', plus 'pred_<k>' / 'pred_<k>_score' for every detection
    returned (k starts at 1).

    Entries are visited in sorted name order, and anything that is not a
    regular file (e.g. a sub-directory) is skipped. Rows are gathered in a
    list local to this call, so calling the function again never mixes in
    rows from an earlier call.

    Args:
        directory: path of the folder containing the images.

    Returns:
        list of dicts, one per processed file.
    """
    # os.listdir returns entries in arbitrary order and includes
    # sub-directories; sort for a reproducible row order and keep files only.
    filenames = sorted(
        name for name in os.listdir(directory)
        if os.path.isfile(os.path.join(directory, name)))

    rows = []
    for filename in tqdm(filenames, desc="Processing"):
        file_path = os.path.join(directory, filename)
        # The measured time covers image loading, inference and the
        # conversion of the outputs to numpy.
        start_time = time.time()
        image_np = load_image_into_numpy_array(file_path)
        input_tensor = tf.convert_to_tensor(np.expand_dims(image_np, 0), dtype=tf.float32)
        detections, _, _ = detect_fn(input_tensor)
        # Class result: shift the raw class values by label_id_offset so they
        # can be looked up in category_index.
        pred_class = (detections['detection_classes'][0].numpy() + label_id_offset).astype(int)
        pred_score = detections['detection_scores'][0].numpy()
        end_time = time.time()

        row = {
            'name': filename,
            'timestamp': timestamp,
            'inferred': "x",
            'time(ms)': round((end_time - start_time)*1000,2),
        }
        for rank, (class_id, score) in enumerate(zip(pred_class, pred_score), start=1):
            row['pred_' + str(rank)] = category_index[class_id]['name']
            row['pred_' + str(rank) + '_score'] = score
        print(f"Finished {filename} in {row['time(ms)']} ms")
        rows.append(row)
    return rows

# Run the detector over image_dir, tabulate one row per file, and persist the
# table. The output file name embeds `timestamp`, so each run of this cell
# writes a new file; the target directory is an absolute, machine-specific path.
result = run_inference(image_dir)
result_df = pd.DataFrame(result)
result_df.to_parquet(f"/mnt/sda1/Backup/result/result_{timestamp}.parquet")


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Finished 0a7dd59bb60415e67535a5b0b1c37c79b5fcdcf9.jpg in 2797.01 ms
Finished 0a7dd59bb60415e67535a5b0b1c37c79b5fcdcf9_lite_lite.jpg in 137.68 ms
Finished 0a1d30b3fe35b24a4c4cf538a1969696dc53ae01_lite.jpg in 138.04 ms
Finished 0a00b11de9ad098befcd6543625b311a9a90ca80_lite.jpg in 136.85 ms
Finished 0a00b11de9ad098befcd6543625b311a9a90ca80.jpg in 2734.15 ms
Finished 0a7dd59bb60415e67535a5b0b1c37c79b5fcdcf9_lite.jpg in 135.87 ms
Finished 0a1d30b3fe35b24a4c4cf538a1969696dc53ae01.jpg in 2673.58 ms


In [8]:
# Number of pred_<k> / pred_<k>_score column pairs to generate names for.
num_pred = 100
# Label columns: pred_1 ... pred_<num_pred>.
list_of_pred = [f"pred_{rank}" for rank in range(1, num_pred + 1)]
# Score columns: pred_1_score ... pred_<num_pred>_score. These must be
# collected in a list; assigning the name directly would leave only the last
# column name as a plain string.
list_of_score = [f"pred_{rank}_score" for rank in range(1, num_pred + 1)]
col_to_drop = []

In [14]:
# Load one previously saved result file. The timestamp in the file name is
# fixed, so this is not necessarily the file written by the latest inference run.
view_result = pd.read_parquet(f"/mnt/sda1/Backup/result/result_1733953588.004121.parquet")
#view_result.sort_values(by=['pred_1_score'], ascending=False).head(20)
# Display the loaded frame as the cell output.
view_result

Unnamed: 0,name,timestamp,inferred,time(ms),pred_1,pred_1_score,pred_2,pred_2_score,pred_3,pred_3_score,...,pred_96,pred_96_score,pred_97,pred_97_score,pred_98,pred_98_score,pred_99,pred_99_score,pred_100,pred_100_score
0,0a7dd59bb60415e67535a5b0b1c37c79b5fcdcf9.jpg,1733954000.0,x,2797.01,apr22,0.83878,jun22,0.832451,aug22,0.25899,...,aug22,0.00828,may22,0.008206,aug22,0.008181,may22,0.008159,jul22,0.008137
1,0a7dd59bb60415e67535a5b0b1c37c79b5fcdcf9_lite_...,1733954000.0,x,137.68,jun22,0.855139,apr22,0.801726,aug22,0.299961,...,may22,0.008552,mar22,0.008547,apr22,0.008485,may22,0.008473,mar22,0.008431
2,0a1d30b3fe35b24a4c4cf538a1969696dc53ae01_lite.jpg,1733954000.0,x,138.04,may22,0.255437,mar22,0.152587,aug22,0.138384,...,may22,0.009245,jun22,0.009241,may22,0.009179,jun22,0.009152,jun22,0.009145
3,0a00b11de9ad098befcd6543625b311a9a90ca80_lite.jpg,1733954000.0,x,136.85,jun22,0.804075,may22,0.527216,apr22,0.308503,...,jun22,0.013594,may22,0.013395,apr22,0.012971,apr22,0.012934,may22,0.012695
4,0a00b11de9ad098befcd6543625b311a9a90ca80.jpg,1733954000.0,x,2734.15,jun22,0.782326,may22,0.516737,apr22,0.335677,...,jul22,0.011586,apr22,0.011576,mar22,0.011284,may22,0.011204,jun22,0.011188
5,0a7dd59bb60415e67535a5b0b1c37c79b5fcdcf9_lite.jpg,1733954000.0,x,135.87,jun22,0.855032,apr22,0.801876,aug22,0.300145,...,may22,0.008552,mar22,0.00855,apr22,0.008501,may22,0.008487,mar22,0.008431
6,0a1d30b3fe35b24a4c4cf538a1969696dc53ae01.jpg,1733954000.0,x,2673.58,may22,0.259532,apr22,0.163305,aug22,0.158829,...,may22,0.009447,jun22,0.009388,apr22,0.009382,mar22,0.009352,mar22,0.009319


In [15]:
# Keep only the pred_<k>_score columns: the label columns and the per-file
# metadata columns are removed in a single drop.
view_result2 = view_result.drop(
    columns=[*list_of_pred, 'name', 'timestamp', 'inferred', 'time(ms)'])
view_result2

Unnamed: 0,pred_1_score,pred_2_score,pred_3_score,pred_4_score,pred_5_score,pred_6_score,pred_7_score,pred_8_score,pred_9_score,pred_10_score,...,pred_91_score,pred_92_score,pred_93_score,pred_94_score,pred_95_score,pred_96_score,pred_97_score,pred_98_score,pred_99_score,pred_100_score
0,0.83878,0.832451,0.25899,0.234456,0.132826,0.102152,0.040367,0.033378,0.031389,0.026809,...,0.008457,0.008407,0.008376,0.008364,0.008351,0.00828,0.008206,0.008181,0.008159,0.008137
1,0.855139,0.801726,0.299961,0.247536,0.124324,0.106461,0.046577,0.034511,0.033634,0.032862,...,0.008898,0.008897,0.008891,0.008871,0.008563,0.008552,0.008547,0.008485,0.008473,0.008431
2,0.255437,0.152587,0.138384,0.117795,0.067158,0.025798,0.022689,0.022608,0.020634,0.020387,...,0.009397,0.009357,0.009328,0.009318,0.009254,0.009245,0.009241,0.009179,0.009152,0.009145
3,0.804075,0.527216,0.308503,0.244802,0.213589,0.190276,0.131537,0.055173,0.050884,0.049137,...,0.013901,0.013855,0.013831,0.01377,0.013603,0.013594,0.013395,0.012971,0.012934,0.012695
4,0.782326,0.516737,0.335677,0.180942,0.170238,0.07747,0.074252,0.040369,0.033265,0.033004,...,0.012153,0.012007,0.01197,0.011907,0.011876,0.011586,0.011576,0.011284,0.011204,0.011188
5,0.855032,0.801876,0.300145,0.247578,0.124633,0.10643,0.046524,0.034469,0.033585,0.032862,...,0.008909,0.008894,0.008872,0.00887,0.008553,0.008552,0.00855,0.008501,0.008487,0.008431
6,0.259532,0.163305,0.158829,0.145382,0.083921,0.030476,0.026489,0.022597,0.022585,0.021699,...,0.009629,0.009593,0.009508,0.009502,0.009485,0.009447,0.009388,0.009382,0.009352,0.009319


In [16]:
# For each row, the name of the score column holding the largest value
# (pred_1_score for every row in the recorded output).
view_result2.idxmax(axis=1)

0    pred_1_score
1    pred_1_score
2    pred_1_score
3    pred_1_score
4    pred_1_score
5    pred_1_score
6    pred_1_score
dtype: object

In [17]:
# For each row, the largest value across the score columns.
view_result2.max(axis=1)

0    0.838780
1    0.855139
2    0.255437
3    0.804075
4    0.782326
5    0.855032
6    0.259532
dtype: float32