# Most of the code is from the Keras RetinaNet repository
https://github.com/fizyr/keras-retinanet/blob/master/examples/ResNet50RetinaNet.ipynb

# Make sure you have the proper environment from requirements.txt

In [1]:
!ls ../requirements.txt
!pip install -r ../requirements.txt

# doc about venv
## https://docs.python.org/3/library/venv.html

# Loading the venv into Jupyter Notebook (won't be required if it's running in Docker)
python -m ipykernel install --user --name=my-virtualenv-name


../requirements.txt


In [1]:
# Notebook setup: plotting/reload magics, imports, and GPU selection.

# show images inline (render matplotlib figures in the notebook output)
%matplotlib inline

# automatically reload modules when they have changed
# (handy while editing the code under ../src/ without restarting the kernel)
%load_ext autoreload
%autoreload 2

# import keras
import keras

# put ../src/ first on the module search path; this must happen before the
# keras_retinanet imports below (presumably the package lives there -- verify)
import sys
sys.path.insert(0, '../src/')

# import keras_retinanet
from keras_retinanet import models
from keras_retinanet.utils.image import read_image_bgr, preprocess_image, resize_image
from keras_retinanet.utils.visualization import draw_box, draw_caption
from keras_retinanet.utils.colors import label_color
from keras_retinanet.utils.gpu import setup_gpu

# import miscellaneous modules
import matplotlib.pyplot as plt
import cv2
import os
import numpy as np
import time

# use this to change which GPU to use
# (the value is passed straight to setup_gpu below)
gpu = 0

# set the modified tf session as backend in keras
setup_gpu(gpu)

Using TensorFlow backend.


# Download the retinanet pretrained model

In [2]:
# Check whether the pretrained weights are already present in ../model/
!ls ../model/resnet50_coco_best_v2.1.0.h5

../model/resnet50_coco_best_v2.1.0.h5


In [3]:
!wget -P ../model/ https://github.com/fizyr/keras-retinanet/releases/download/0.5.1/resnet50_coco_best_v2.1.0.h5

--2020-05-24 19:29:07--  https://github.com/fizyr/keras-retinanet/releases/download/0.5.1/resnet50_coco_best_v2.1.0.h5
Resolving github.com (github.com)... 52.192.72.89
Connecting to github.com (github.com)|52.192.72.89|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://github-production-release-asset-2e65be.s3.amazonaws.com/100249425/b7184a80-9350-11e9-9cc2-454f5c616394?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIWNJYAX4CSVEH53A%2F20200524%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20200524T102907Z&X-Amz-Expires=300&X-Amz-Signature=587c58002bb1a69d792f5dad79756750edd54d5ab7b52000cec86f9cbb10d8ca&X-Amz-SignedHeaders=host&actor_id=0&repo_id=100249425&response-content-disposition=attachment%3B%20filename%3Dresnet50_coco_best_v2.1.0.h5&response-content-type=application%2Foctet-stream [following]
--2020-05-24 19:29:07--  https://github-production-release-asset-2e65be.s3.amazonaws.com/100249425/b7184a80-9350-11e9-9cc2-454f5c616394?X-Amz-Algorit

In [9]:
# Show size and timestamp of the weights file to confirm the download is complete
!ls -lha ../model/resnet50_coco_best_v2.1.0.h5

-rw-r--r--  1 yoovraj.shinde  679754705   146M Jun 20  2019 ../model/resnet50_coco_best_v2.1.0.h5


In [5]:
# Location of the downloaded/trained weights; edit this to use another model.
# Released models: https://github.com/fizyr/keras-retinanet/releases
model_path = os.path.join('../', 'model', 'resnet50_coco_best_v2.1.0.h5')

# Load the RetinaNet model with a ResNet-50 backbone.
model = models.load_model(model_path, backbone_name='resnet50')

# If the weights are a training model rather than an inference model, convert it:
# see: https://github.com/fizyr/keras-retinanet#converting-a-training-model-to-inference-model
#model = models.convert_model(model)

# Uncomment to inspect the network layers:
#print(model.summary())

# Label id -> class name mapping used when captioning detections.
# The position of each name in the list is its label id (0 = 'person', ..., 79 = 'toothbrush').
# A reduced mapping would look like: {0: 'person', 1: 'car'}
labels_to_names = dict(enumerate([
    'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck',
    'boat', 'traffic light', 'fire hydrant', 'stop sign', 'parking meter', 'bench',
    'bird', 'cat', 'dog', 'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra',
    'giraffe', 'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee',
    'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove',
    'skateboard', 'surfboard', 'tennis racket', 'bottle', 'wine glass', 'cup',
    'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange',
    'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch',
    'potted plant', 'bed', 'dining table', 'toilet', 'tv', 'laptop', 'mouse',
    'remote', 'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink',
    'refrigerator', 'book', 'clock', 'vase', 'scissors', 'teddy bear',
    'hair drier', 'toothbrush',
]))

tracking <tf.Variable 'Variable:0' shape=(9, 4) dtype=float32, numpy=
array([[-22.627417, -11.313708,  22.627417,  11.313708],
       [-28.50876 , -14.25438 ,  28.50876 ,  14.25438 ],
       [-35.918785, -17.959393,  35.918785,  17.959393],
       [-16.      , -16.      ,  16.      ,  16.      ],
       [-20.158737, -20.158737,  20.158737,  20.158737],
       [-25.398417, -25.398417,  25.398417,  25.398417],
       [-11.313708, -22.627417,  11.313708,  22.627417],
       [-14.25438 , -28.50876 ,  14.25438 ,  28.50876 ],
       [-17.959393, -35.918785,  17.959393,  35.918785]], dtype=float32)> anchors
tracking <tf.Variable 'Variable:0' shape=(9, 4) dtype=float32, numpy=
array([[-45.254833, -22.627417,  45.254833,  22.627417],
       [-57.01752 , -28.50876 ,  57.01752 ,  28.50876 ],
       [-71.83757 , -35.918785,  71.83757 ,  35.918785],
       [-32.      , -32.      ,  32.      ,  32.      ],
       [-40.317474, -40.317474,  40.317474,  40.317474],
       [-50.796833, -50.796833,  50.7



In [6]:
# Record the OpenCV version in the notebook output for reproducibility
print(cv2.__version__)

4.2.0


# Copy the required files into data folder
## train_00.json (annotated data for the train_00 video)
## train_00.mp4 (video file)

In [10]:
# Verify that both the annotation file and the video have been copied into ../data/
!ls -lha ../data/train_00.json
!ls -lha ../data/train_00.mp4

-rw-rw-r--  1 yoovraj.shinde  679754705   428K Mar 26 23:16 ../data/train_00.json
-rw-r--r--@ 1 yoovraj.shinde  679754705    87M May 19 21:18 ../data/train_00.mp4


Load the json file

In [11]:
import json

# Parse the ground-truth annotations for train_00 into a Python object;
# the file handle is closed automatically when the with-block exits.
with open('../data/train_00.json') as annotation_file:
    train_00_json = json.load(annotation_file)


Open up the input and output video handlers

In [12]:
# Open the input video and fail fast if it cannot be read: continuing with an
# unopened capture would report a 0x0 frame size and produce an empty output file.
cap = cv2.VideoCapture('../data/train_00.mp4')
if not cap.isOpened():
    raise IOError("Error opening video stream or file")

# Frame dimensions of the input; cap.get() returns floats, hence the int() casts below
w = cap.get(cv2.CAP_PROP_FRAME_WIDTH)
h = cap.get(cv2.CAP_PROP_FRAME_HEIGHT)

# Output writer: mp4v codec, 15 frames per second, same frame size as the input
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
out = cv2.VideoWriter('../data/train_00_output.mp4', fourcc, 15.0, (int(w), int(h)))



Process each frame.

Run model prediction on each frame and draw red boxes for prediction of pedestrians.

Draw blue box for ground truth.

In [13]:
# Number of leading frames to process; set to None to process the whole video
MAX_FRAMES = 10
# Detections scoring below this confidence are not drawn
SCORE_THRESHOLD = 0.5
# Label id of the only class drawn as a prediction (0 is 'person' in labels_to_names)
PEDESTRIAN_LABEL = 0

# Per-frame ground-truth annotations, indexed by frame number
annotated_frames = train_00_json['sequence']

frame_count = 0

# Read until the video is completed (or MAX_FRAMES frames have been processed)
while cap.isOpened():
    if MAX_FRAMES is not None and frame_count >= MAX_FRAMES:
        break

    # Capture frame-by-frame; ret is False once no more frames can be read
    ret, image = cap.read()
    if not ret:
        break

    # Keep an untouched copy to draw on and write to the output video
    draw = image.copy()

    # Preprocess image for the network. The frame is deliberately left in
    # OpenCV's BGR channel order: keras-retinanet's preprocess_image assumes
    # BGR input, so converting to RGB here would swap the channels the model sees.
    image = preprocess_image(image)
    image, scale = resize_image(image)

    # Run the detector on a batch containing just this frame
    start = time.time()
    boxes, scores, labels = model.predict_on_batch(np.expand_dims(image, axis=0))
    print("processing time: ", time.time() - start)

    # Correct for image scale: map boxes back to the original frame size
    boxes /= scale

    # Visualize detections
    for box, score, label in zip(boxes[0], scores[0], labels[0]):
        # Scores are sorted, so nothing after the first low score can qualify
        if score < SCORE_THRESHOLD:
            break
        # Skip other classes but keep scanning: a confident non-pedestrian
        # detection must not hide the pedestrians ranked after it
        if label != PEDESTRIAN_LABEL:
            continue

        color = label_color(label)

        b = box.astype(int)
        draw_box(draw, b, color=color)

        caption = "{} {:.3f}".format(labels_to_names[label], score)
        draw_caption(draw, b, caption)

    # Visualize ground truth; frames beyond the annotated range, or without a
    # 'Pedestrian' entry, simply have no ground-truth boxes
    if frame_count < len(annotated_frames):
        pedestrians_list = annotated_frames[frame_count].get('Pedestrian', [])
    else:
        pedestrians_list = []
    for box in pedestrians_list:
        b = list(map(int, box['box2d']))
        # (255, 0, 0) is blue in OpenCV's BGR channel order
        draw_box(draw, b, color=(255, 0, 0))
        draw_caption(draw, b, "P")

    frame_count = frame_count + 1
    # Write the resulting frame
    out.write(draw)
 

processing time:  4.502428770065308
processing time:  2.030115842819214
processing time:  2.010169267654419
processing time:  2.3186609745025635
processing time:  2.2867300510406494
processing time:  2.399228096008301
processing time:  2.3198962211608887
processing time:  2.344682216644287
processing time:  2.2800979614257812
processing time:  2.6403582096099854


In [14]:
# Processing is finished: close the output writer so the file is written
out.release()

# Release the input video capture object
cap.release()

# Close any OpenCV windows that may be open
cv2.destroyAllWindows()