In [1]:
from __future__ import absolute_import, division, print_function

import os
import sys
import skimage.io
import numpy as np
import caffe
import json
import cv2
import time
from glob import glob, iglob
import matplotlib.pyplot as plt
import matplotlib.cm as cm
%matplotlib inline
sys.path.append('../')
sys.path.append('../track_model')

from track_model import track_model_test as trackmodel
from util import processing_tools, im_processing, text_processing, eval_tools
import demo_track_config

In [2]:
def sigmoid(x):
    """Return the logistic function ``1 / (1 + exp(-x))`` element-wise.

    Parameters
    ----------
    x : scalar or numpy.ndarray
        Input value(s).

    Returns
    -------
    NumPy scalar or ndarray with the same shape as ``x``.

    Notes
    -----
    The value is evaluated as ``exp(-log(1 + exp(-x)))`` through
    ``np.logaddexp``.  The direct formula computes ``exp(-x)``, which
    overflows (RuntimeWarning) for large negative ``x``; this form never
    does, while giving 0 and 1 in the two saturated tails as before.
    """
    # logaddexp(0, -x) == log(exp(0) + exp(-x)) == log(1 + exp(-x)),
    # computed without ever forming exp(-x) for large arguments.
    return np.exp(-np.logaddexp(0, -x))

# Demo configuration object; later cells read their settings from it.
config = demo_track_config.Config()

# Write the two 'val' network definitions to disk as prototxt files.
# Each file is opened first and the definition generated inside the
# `with`, one network after the other.
for proto_path, build_net in (
        ('./demo_track_features.prototxt', trackmodel.generate_conv_features),
        ('./demo_track_scores.prototxt', trackmodel.generate_conv_scores)):
    with open(proto_path, 'w') as proto_file:
        proto_file.write(str(build_net('val', config)))

# Pick the GPU named in the config, then switch Caffe into GPU mode.
caffe.set_device(config.gpu_id)
caffe.set_mode_gpu()

# Instantiate both networks in TEST phase from the same pretrained weights.
conv_features_net = caffe.Net(
    './demo_track_features.prototxt', config.pretrained_model, caffe.TEST)
conv_scores_net = caffe.Net(
    './demo_track_scores.prototxt', config.pretrained_model, caffe.TEST)

In [3]:
# Sequence to track and the box that seeds the tracker.
# (alternative demo sequence: video_name = 'Bebop2_drone')
video_name = 'woman'
video_file = './demo_data/%s/*.jpg' % (video_name)
query_box = np.loadtxt('./demo_results/%s/0001_pred.txt' % (video_name),
                       delimiter=',').reshape((-1, 4))

# The first row is the initial target box.  Width/height add 1, i.e. the
# four values are treated as inclusive (x1, y1, x2, y2) corners.
init_box = query_box[0, :].copy()
init_box_w = init_box[2] - init_box[0] + 1
init_box_h = init_box[3] - init_box[1] + 1

# Read the first frame; a single-channel frame is replicated to 3 channels.
qimg = skimage.io.imread('./demo_data/%s/0001.jpg' % (video_name))
if qimg.ndim == 2:
    qimg = np.tile(qimg[..., np.newaxis], (1, 1, 3))
qimg_height, qimg_width = qimg.shape[:2]

# Grow the box by one full width/height on every side (three times the
# original extent) so the crop includes some context around the target.
context_margin = 1.0 * np.array([init_box_w, init_box_h])
qbox = np.concatenate((init_box[:2] - context_margin,
                       init_box[2:] + context_margin))
qbox = np.round(qbox).astype(int).reshape((-1, 4))

# Run the feature network on the query crop.  Only batch slot 0 is filled;
# the remaining config.N - 1 slots stay zero.
dummy_label = np.zeros((config.N, 1))
inputs = np.zeros((config.N, config.query_H, config.query_W, 3),
                  dtype=np.float32)
inputs[0, ...] = im_processing.crop_and_pad_bboxes_subtract_mean(
    qimg, qbox, config.qimage_size*3, trackmodel.channel_mean)
# (N, H, W, C) -> (N, C, H, W), then reverse the channel axis
# (presumably RGB -> BGR for Caffe -- confirm against the training code).
inputs_trans = inputs.transpose((0, 3, 1, 2))[:, ::-1, :, :]
conv_features_net.blobs['image'].data[...] = inputs_trans
conv_features_net.blobs['label'].data[...] = dummy_label
conv_features_net.forward()
conv_features = conv_features_net.blobs['feat_all'].data[...].copy()

# Keep only the central part of the query feature map (factor 3.0 passed
# to the helper) and add a leading axis.
qfeat = conv_features[0, ...].copy()
qfeat_crop = im_processing.crop_featmap_from_center(qfeat, 3.0)
qfeat_crop_resh = qfeat_crop.reshape((-1,) + qfeat_crop.shape[:3])

# Install the cropped query features as the weights of the 'scores' layer,
# which turns them into the dynamic filter applied to every search crop.
conv_scores_net.params['scores'][0].data[...] = qfeat_crop_resh

################################################################################
# Track the target through the remaining frames
################################################################################
# Save the first frame with the initial box drawn on it.  The -1 shifts the
# box corners before drawing (presumably 1-based box -> 0-based pixels).
query_bbox = np.copy(qimg)
init_top_left = (int(init_box[0]-1), int(init_box[1]-1))
init_bottom_right = (int(init_box[2]-1), int(init_box[3]-1))
cv2.rectangle(query_bbox, init_top_left, init_bottom_right, (255, 0, 0), 3)
plt.imsave('./demo_results/%s/0001_initial.jpg' % (video_name), query_bbox)

# The glob result is only used to count frames (capped at 500); the frames
# themselves are read by their zero-padded index inside the loop.
start_frame_id = 1
frames = sorted(glob(video_file))
num_frames = min(len(frames), 500)

# One row per frame: [score, x1, y1, x2, y2].  Row 0 holds the given box
# with a score of 1.
results = np.zeros((num_frames, 5))
results[0, 0] = 1
results[0, 1:] = init_box

# Centre of the initial box.
center_x = np.ceil((init_box[2]-init_box[0]+1)/2.0) + init_box[0] - 1
center_y = np.ceil((init_box[3]-init_box[1]+1)/2.0) + init_box[1] - 1

# Search-region multiplier and the current target size estimate.
sz_times = config.sz_times
sample_w = init_box_w
sample_h = init_box_h

prev_scale = 1
counter = 1
start_time = time.time()
for ii in range(start_frame_id+1, num_frames+start_frame_id):
    img = skimage.io.imread('./demo_data/%s/%04d.jpg' % (video_name, ii))
    if img.ndim == 2:
        # single-channel frame: replicate to three channels
        img = np.tile(img[..., np.newaxis], (1, 1, 3))
    img_height, img_width = img.shape[:2]

    # One search box per scale, centred on the previous prediction.
    boxes = np.zeros((config.scales.size, 4))
    for ss, scale in enumerate(config.scales):
        half_w = 0.5*sz_times*scale*sample_w
        half_h = 0.5*sz_times*scale*sample_h
        boxes[ss, :] = (center_x - half_w + 1,
                        center_y - half_h + 1,
                        center_x + half_w,
                        center_y + half_h)
    boxes = np.round(boxes).astype(int).reshape((-1, 4))
    num_boxes = boxes.shape[0]

    # Score every search crop against the query filter.  Unused batch slots
    # stay zero and their outputs are discarded below.
    inputs = np.zeros((config.N, config.input_H, config.input_W, 3),
                      dtype=np.float32)
    inputs[:num_boxes, ...] = im_processing.crop_and_pad_bboxes_subtract_mean(
        img, boxes, config.timage_size, trackmodel.channel_mean)
    # (N, H, W, C) -> (N, C, H, W) with the channel axis reversed
    inputs_trans = inputs.transpose((0, 3, 1, 2))[:, ::-1, :, :]
    conv_scores_net.blobs['image'].data[...] = inputs_trans
    conv_scores_net.blobs['label'].data[...] = dummy_label
    conv_scores_net.forward()
    scores_val = conv_scores_net.blobs['scores'].data[:num_boxes, ...].copy()

    # Spatial size of one score map.
    map_h, map_w = scores_val.shape[2:4]
    map_size = map_h * map_w

    # Apply the scale-change penalty, then flatten for the global argmax.
    scores_val = np.multiply(scores_val, config.scale_penalty).reshape(-1)

    # Decode the flat argmax into 0-based (scale, row, column) indices.
    best_flat = int(np.argmax(scores_val))
    max_score = scores_val[best_flat]
    scale_pos, within_map = divmod(best_flat, map_size)
    row_pos, col_pos = divmod(within_map, map_w)

    # Turn the peak position into a box.  Offsets are measured from score-map
    # index 5 (hard-coded; presumably the map centre -- confirm) and scaled
    # from network-input pixels to the search box's extent.  predict_box has
    # the integer dtype of `boxes`, so assigned values are truncated.
    bbox = boxes[scale_pos, :].copy()
    predict_box = bbox.copy()
    bbox_w = bbox[2]-bbox[0]+1
    bbox_h = bbox[3]-bbox[1]+1
    best_scale = config.scales[scale_pos]
    predict_box[0] = np.maximum(
        bbox[0] + (col_pos-5) * config.spatial_ratio / config.timage_size * bbox_w, 1)
    predict_box[1] = np.maximum(
        bbox[1] + (row_pos-5) * config.spatial_ratio / config.timage_size * bbox_h, 1)
    # Be careful when extended to multiple scales
    predict_box[2] = np.minimum(predict_box[0] + sample_w * best_scale - 1, img_width)
    predict_box[3] = np.minimum(predict_box[1] + sample_h * best_scale - 1, img_height)

    # Store score and box for this frame.
    results[counter, 0] = max_score
    results[counter, 1:] = predict_box

    # Re-centre on the prediction and low-pass filter the scale estimate.
    pred_w = predict_box[2]-predict_box[0]+1
    pred_h = predict_box[3]-predict_box[1]+1
    center_x = np.ceil(pred_w/2.0) + predict_box[0] - 1
    center_y = np.ceil(pred_h/2.0) + predict_box[1] - 1
    prev_scale = prev_scale*(1-config.scaleLP) + best_scale*config.scaleLP

    sample_w, sample_h = sample_w * prev_scale, sample_h * prev_scale

    counter += 1

    # Write the predicted box as text and the annotated frame as an image.
    pred_bbox = np.copy(img)
    pred_top_left = (int(predict_box[0]-1), int(predict_box[1]-1))
    pred_bottom_right = (int(predict_box[2]-1), int(predict_box[3]-1))
    cv2.rectangle(pred_bbox, pred_top_left, pred_bottom_right, (0, 255, 0), 3)
    np.savetxt('./demo_results/%s/%04d_pred.txt' % (video_name,ii),
               predict_box.reshape((1,4)).astype(np.int32),
               fmt='%d', delimiter=',')
    plt.imsave('./demo_results/%s/%04d.jpg' % (video_name,ii), pred_bbox)

    # To inspect frames inline, show query_bbox and pred_bbox side by side
    # here with plt.subplot / plt.imshow.

  "%s to %s" % (dtypeobj_in, dtypeobj))
