In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy as np
import os, sys
import time
import math
import tensorflow as tf
import matplotlib.pyplot as plt
%matplotlib inline

from copy import deepcopy

import cv2

  from ._conv import register_converters as _register_converters


In [2]:
# import necessary pwcnet (tfoptflow) functions

# Put the tfoptflow sources on the import path. The relative path is resolved
# against the kernel's current working directory, not this notebook's location.
sys.path.append(os.path.realpath('../tfoptflow/tfoptflow'))

from model_pwcnet import ModelPWCNet, _DEFAULT_PWCNET_TEST_OPTIONS
from visualize import plot_img_pairs_w_flows
from optflow import flow_to_img, flow_write_as_png

# PWC-Net checkpoint to restore; handed to the model via nn_opts['ckpt_path'].
CKPT_PATH = '../../pwcnet/pwcnet_sm/pwcnet.ckpt-592000'

In [3]:
# Run configuration. The commented-out paths are the alternative hockey clip.

# Video whose frames are read by MaskRCNNSequenceStream.
# video_in_path = '/mnt/disks/tensorflow-disk/video_distillation/hockey1/hockey1002.mp4'
video_in_path = '/mnt/disks/tensorflow-disk/video_distillation/giraffe1/giraffe1000.mp4'
# The main loop stops once this many frames have been processed.
max_frames = 1001
# Every `training_stride`-th frame takes its labels straight from the
# detections; the frames in between are propagated with optical flow.
training_stride = 8
# Detections file (.npy) that is streamed alongside the video.
# detection_path = '/mnt/disks/tensorflow-disk/video_distillation/hockey1/detectron_large_mask_rcnn_1_hockey1002.npy'
detection_path = '/mnt/disks/tensorflow-disk/video_distillation/giraffe1/detectron_large_mask_rcnn_1_giraffe1000.npy'
# Where to np.save the per-frame statistics; None skips saving.
stats_path = None
# Forwarded to MaskRCNNSequenceStream as its start_frame argument.
start_frame = 0
# Frame size in pixels; used to reshape label maps and to size the flow output.
height = 1080
width = 1920

In [5]:
# import helper functions from online_scene_seg

# TODO: these are copied over because online_scene_seg has relative imports that
# don't work with this notebook's path.

def get_class_groups():
    """Return the detector class ids to evaluate, grouped into label classes.

    Each inner list is one group of detector class ids. This notebook only
    evaluates people and giraffes, so exactly two single-id groups are
    returned. A fresh list is built on every call, so callers may mutate it.

    The version this was copied from also assembled two-wheeler, vehicle,
    utensil and furniture groups (plus a `fine_classes` variant) and then
    unconditionally overwrote the result; that dead code is removed here.
    For reference, the discarded groups were:
        two-wheelers [2, 4]; vehicles [3, 6, 7, 8];
        utensils [40..46] (bottle, wine glass, cup, fork, knife, spoon, bowl);
        furniture [14, 57, 58, 61] (bench, chair, couch, dining table).

    Returns:
        list[list[int]]: `[[1], [24]]`.
    """
    people_cls = [1]
    # just detect people and giraffes
    giraffe_cls = [24]

    return [people_cls, giraffe_cls]

def update_stats(labels, pred_vals, class_tp, class_fp, class_fn,
                 class_total, class_correct, weight_mask, frame_stats,
                 entropy_vals, frame_id):
    """Accumulate per-class segmentation counts for one frame.

    For every class index ``g`` in ``range(len(class_total))`` the pixels
    selected by ``weight_mask`` are counted as true positives, false
    positives and false negatives by comparing ``pred_vals`` with ``labels``.
    The counts are added in place to the running accumulators and a
    per-frame snapshot is stored in ``frame_stats[frame_id]``.

    Args:
        labels: array of ground-truth class indices.
        pred_vals: array of predicted class indices, comparable element-wise
            with ``labels``.
        class_tp, class_fp, class_fn: running per-class TP / FP / FN totals,
            updated in place.
        class_total: running per-class count of labelled pixels, updated in
            place; its length defines the number of classes.
        class_correct: running per-class count of correctly predicted
            pixels, updated in place (receives the same increment as
            ``class_tp``).
        weight_mask: mask combined with ``np.logical_and``; only pixels where
            it is truthy are counted.
        frame_stats: dict that receives this frame's entry.
        entropy_vals: stored unchanged under ``'average_entropy'``.
        frame_id: key under which the frame entry is stored.

    Returns:
        None. All results are written into the arguments.
    """
    eps = 1e-06
    num_classes = len(class_total)

    # Per-frame tallies, one float32 slot per class.
    frame_tp, frame_fp, frame_fn, frame_iou, frame_correct, frame_total = (
        np.zeros(num_classes, np.float32) for _ in range(6))

    agree = (pred_vals == labels)

    for g in range(num_classes):
        labelled_g = np.logical_and((labels == g), weight_mask)
        predicted_g = np.logical_and((pred_vals == g), weight_mask)

        n_tp = np.sum(np.logical_and(labelled_g, agree))
        n_labelled = np.sum(labelled_g)
        n_fp = np.sum(np.logical_and(np.logical_not(labelled_g), predicted_g))
        n_fn = np.sum(np.logical_and(labelled_g, np.logical_not(predicted_g)))

        # snapshot for this frame
        frame_tp[g] = n_tp
        frame_correct[g] = n_tp
        frame_total[g] = n_labelled
        frame_fp[g] = n_fp
        frame_fn[g] = n_fn
        # eps keeps the ratio defined (and equal to 1) when the class is
        # absent from both labels and predictions
        frame_iou[g] = (n_tp + eps) / (n_tp + n_fp + n_fn + eps)

        # running totals across frames
        class_tp[g] += n_tp
        class_total[g] += n_labelled
        class_correct[g] += n_tp
        class_fp[g] += n_fp
        class_fn[g] += n_fn

    frame_stats[frame_id] = {
        'tp': frame_tp,
        'fp': frame_fp,
        'fn': frame_fn,
        'iou': frame_iou,
        'correct': frame_correct,
        'total': frame_total,
        'average_entropy': entropy_vals,
    }

In [6]:
# Mask R-CNN utilities

# Put the sibling ../datasets and ../utils directories on the import path.
# The relative paths are resolved against the kernel's current working directory.
sys.path.append(os.path.realpath('../datasets'))
sys.path.append(os.path.realpath('../utils'))

# batch_segmentation_masks is used in the main loop to turn detections into a
# label map; visualize_masks is only referenced by the commented-out plotting code.
from mask_rcnn_tfrecords import get_dataset, batch_segmentation_masks,\
                                visualize_masks
# Iterator over the video frames and their detections.
from mask_rcnn_stream import MaskRCNNSequenceStream

In [7]:
# initialize PWCNet in test mode

# Work on a private copy of the library defaults so they stay untouched,
# then layer this notebook's overrides on top in a single step.
nn_opts = deepcopy(_DEFAULT_PWCNET_TEST_OPTIONS)
nn_opts.update({
    'verbose': True,
    'ckpt_path': CKPT_PATH,
    'batch_size': 1,
    'gpu_devices': ['/device:GPU:0'],
    'controller': '/device:GPU:0',
    'use_dense_cx': False,
    'use_res_cx': False,
    'pyr_lvls': 6,
    'flow_pred_lvl': 2,
    # since the model generates flow padded to multiples of 64
    # reduce back to the input video size
    'adapt_info': (1, height, width, 2),
})

nn = ModelPWCNet(mode='test', options=nn_opts)

Building model...
Instructions for updating:
`normal` is a deprecated alias for `truncated_normal`
... model built.
Loading model checkpoint ../../pwcnet/pwcnet_sm/pwcnet.ckpt-592000 for eval or testing...

INFO:tensorflow:Restoring parameters from ../../pwcnet/pwcnet_sm/pwcnet.ckpt-592000
... model loaded


In [4]:
from flow_cython import flow

In [9]:
# read in both the video and the detections using stream
video_files = [video_in_path]
detections_paths = [detection_path]

# stride=1: every frame is yielded; the keyframe spacing is applied later
# in the main loop via training_stride.
input_streams = MaskRCNNSequenceStream(video_files,
                                       detections_paths,
                                       start_frame=start_frame,
                                       stride=1)

# get the class groups
class_groups = get_class_groups()

# initialize the metrics

curr_frame = 0          # number of frames consumed so far
prev_frame = None       # previous frame, first image of each flow pair
prev_in_frame = None
pred = None             # label map currently being propagated
per_frame_stats = {}    # frame index -> stats dict written by update_stats
# one slot per class group plus one extra (presumably background - TODO confirm)
num_classes = len(class_groups) + 1

# Running per-class accumulators, updated in place by update_stats.
class_correct = np.zeros(num_classes, np.float32)
class_total = np.zeros(num_classes, np.float32)
class_tp = np.zeros(num_classes, np.float32)
class_fp = np.zeros(num_classes, np.float32)
class_fn = np.zeros(num_classes, np.float32)
class_iou = np.zeros(num_classes, np.float32)

# Flat pixel indices 0 .. width*height-1, as float64 and as int32.
# Built with np.arange instead of a Python loop over ~2M elements; the
# resulting values and dtypes are the same.
pos_matrix = np.arange(width * height, dtype=np.float64)
pos_matrix_idx = pos_matrix.astype(np.int32)

# Main evaluation loop. Every `training_stride`-th frame is a "parent" frame
# whose label map comes straight from the detections; on all other frames the
# previous label map is pushed forward with PWC-Net optical flow and scored
# against that frame's detection-derived labels.
for frame, boxes, classes, scores, masks, num_objects, frame_id in input_streams:

    if curr_frame >= max_frames:
        break

    # Add a leading axis of size 1 (a batch of one) before building the labels.
    boxes = np.expand_dims(boxes, axis=0)
    classes = np.expand_dims(classes, axis=0)
    scores = np.expand_dims(scores, axis=0)
    masks = np.expand_dims(masks, axis=0)
    num_objects = np.expand_dims(num_objects, axis=0)

    labels_vals, _ = batch_segmentation_masks(1,
                                              (height, width),
                                              boxes, classes, masks, scores,
                                              num_objects, True,
                                              class_groups)
    labels_val = np.reshape(labels_vals, (height, width))
    if prev_frame is None:
        prev_frame = frame

    if curr_frame % training_stride == 0:
        # Parent frame: reset the propagated prediction to the fresh labels.
        pred = labels_val
        pred_ext = np.reshape(pred, (1, height, width))
        gt_pred = labels_val
        gt_im = frame
    else:
        # compute forward flow
        start = time.time()
        pred_flows = nn.predict_from_img_pairs([(prev_frame, frame)],
                                               batch_size=1,
                                               verbose=False)
        # shape: (height, width, 2)
        # rounded to whole-pixel displacements for the Cython routine
        pred_flow = np.round(pred_flows[0]).astype(np.int32)

        # Cython function
        pred = flow(pred, pred_flow)
        end = time.time()

        # update stats
        pred_ext = np.reshape(pred, (1, height, width))
        # Every pixel counts equally. Builtin `bool` is used because the
        # `np.bool` alias was deprecated in NumPy 1.20 and removed in 1.24.
        update_stats(labels_vals, pred_ext, class_tp, class_fp, class_fn,
                     class_total, class_correct, np.ones(labels_vals.shape, dtype=bool),
                     per_frame_stats, None, curr_frame)

    # visualize predictions
#     vis_shape = (height, width, 3)
#     vis_labels = visualize_masks(pred_ext, 1, vis_shape,
#                                  num_classes=num_classes)
#     vis_labels = vis_labels[0]
#     labels_image = cv2.addWeighted(frame, 0.5, vis_labels, 0.5, 0)

#     fig = plt.figure(figsize=(16, 9))
#     ax = fig.add_subplot(111)
#     ax.set_aspect(7)
#     ax.imshow(labels_image)
#     # plt.show()

#     fig.savefig('/home/stevenzc3/giraffe_flow/{:03d}.png'.format(curr_frame))
#     plt.close()

    prev_frame = frame
    # Stats exist only for propagated frames, which are also the only frames
    # on which `start` and `end` were set in this iteration.
    if curr_frame in per_frame_stats:
        print('time', end - start)
        print(curr_frame, per_frame_stats[curr_frame]["iou"][1])
    else:
        print("Parent prediction: ", curr_frame)
    curr_frame += 1

# Persist the per-frame statistics only when an output path was configured.
if stats_path:
    np.save(stats_path, [per_frame_stats])


['/mnt/disks/tensorflow-disk/video_distillation/giraffe1/giraffe1000.mp4']
Parent prediction:  0
time 6.297091722488403
1 0.77006567
time 0.17461037635803223
2 0.67289966
time 0.18724918365478516
3 0.6278668
time 0.18098068237304688
4 0.58880424
time 0.16969704627990723
5 0.545571
time 0.1761150360107422
6 0.47822094
time 0.18421149253845215
7 0.38723433
Parent prediction:  8
time 0.16814088821411133
9 0.65053463
time 0.17323827743530273
10 0.47578785
time 0.17032361030578613
11 0.43177098
time 0.16793465614318848
12 0.40003175
time 0.1685318946838379
13 0.3845975
time 0.1739511489868164
14 0.37175918
time 0.1699848175048828
15 0.34728244
Parent prediction:  16
time 0.17185568809509277
17 0.84512323
time 0.17558884620666504
18 0.7479621
time 0.18017172813415527
19 0.6731222
time 0.17629694938659668
20 0.6184462
time 0.16859865188598633
21 0.56417906
time 0.16857266426086426
22 0.51135755
time 0.16977238655090332
23 0.45751446
Parent prediction:  24
time 0.17063426971435547
25 0.8640995

KeyboardInterrupt: 