# Benchmark

We use the following two datasets for evaluation (`myvot2021` and `mytc128`, both loaded below).

In [1]:
import shutil
import urllib.request
import os
import cv2
import numpy as np
import yaml

import dataset
import scoring

In [2]:
# Make sure the directory that holds the downloaded tracker repositories exists.
# exist_ok=True keeps the cell idempotent and avoids the check-then-create race.
os.makedirs('models', exist_ok=True)

In [14]:
# %cd ../..
# (disabled) would move the working directory two levels up before loading.

# Load the two evaluation datasets by name via the `dataset` module imported above.
vot_dataset = dataset.load_dataset('myvot2021')
tc_dataset = dataset.load_dataset('mytc128')

In [15]:
# Echo both loaded datasets. NOTE: this displays the complete nested structure,
# so the cell output is very long.
vot_dataset, tc_dataset

({'agility': {'name': 'agility',
   'gt': [[549, 249, 24, 58],
    [553, 242, 25, 41],
    [558, 241, 22, 38],
    [557, 244, 28, 43],
    [558, 255, 26, 51],
    [551, 270, 30, 37],
    [537, 275, 34, 37],
    [524, 280, 34, 35],
    [509, 280, 34, 36],
    [496, 274, 38, 38],
    [494, 275, 34, 33],
    [487, 273, 36, 35],
    [479, 275, 43, 35],
    [487, 275, 28, 33],
    [495, 283, 16, 24],
    [0, 0, 0, 0],
    [454, 284, 19, 27],
    [441, 266, 37, 44],
    [422, 255, 59, 56],
    [400, 242, 59, 48],
    [390, 250, 58, 42],
    [373, 256, 65, 55],
    [371, 269, 65, 52],
    [368, 293, 66, 26],
    [359, 290, 73, 29],
    [330, 288, 78, 40],
    [314, 294, 71, 44],
    [292, 292, 79, 48],
    [270, 291, 63, 45],
    [247, 296, 84, 43],
    [224, 307, 81, 38],
    [211, 312, 58, 38],
    [198, 321, 65, 36],
    [195, 322, 48, 38],
    [189, 314, 37, 48],
    [188, 324, 32, 36],
    [188, 326, 36, 30],
    [188, 312, 38, 37],
    [201, 303, 43, 40],
    [213, 303, 42, 38],
    [22

## [SiamSE](https://github.com/isosnovik/SiamSE)

In [5]:
# One-time setup, kept disabled: downloads the SiamSE repository archive from GitHub,
# unpacks it under models/, renames the extracted 'SiamSE-master' folder to 'SiamSE'
# and deletes the zip. Uncomment to fetch the code on a fresh checkout.
# model = 'SiamSE'
# path_zip = os.path.join('models', model + '.zip')
# path_no_zip = os.path.join('models')

# urllib.request.urlretrieve('https://github.com/ISosnovik/SiamSE/archive/refs/heads/master.zip', path_zip)
# shutil.unpack_archive(path_zip, path_no_zip)
# os.rename(os.path.join(path_no_zip, 'SiamSE-master'), os.path.join(path_no_zip, model))
# os.remove(path_zip)

Install the required packages (see the SiamSE README).

In [6]:
# Switch into the SiamSE checkout: the following cells import `lib.*` and open
# relative paths (config and checkpoint) from this directory.
%cd models/SiamSE
# (disabled) dependency install; note that the PyPI package for PyTorch is `torch`, not `pytorch`.
# %pip install yacs scipy shapely opencv-python numpy torch torchvision pyyaml

/mnt/c/Users/33652/Documents/ING 2 Prog/DeepNN/projet/models/SiamSE


Download the pretrained model (`checkpoint_vot.pth`) and place it in `models/SiamSE`:
https://drive.google.com/file/d/1WQ-9_QE9Xk9wj52vVcEDIXY2NBTupAnZ/view?usp=share_link

In [7]:
import lib.models.models as models
from lib.tracker import SESiamFCTracker
from lib.utils import load_pretrain, cxy_wh_2_rect, get_axis_aligned_bbox, load_dataset, poly_iou, convert_color_RGB

In [8]:
# Checkpoint file handed to load_pretrain; a relative path, so it is resolved
# against the current working directory.
model_pretrained = 'checkpoint_vot.pth'
# Key used to pick the tracker settings under the config's 'TRACKER' section.
model_carac = 'VOT2017'
# YAML file holding the model name and the tracker settings (relative path).
config_file = 'configs/test.yaml'

In [9]:
# Read the configuration from the YAML file.
# NOTE(review): yaml.load with FullLoader can build more than plain data types; as the
# config comes from a downloaded repository, yaml.safe_load would be the safer choice —
# confirm the file uses no custom tags before switching.
with open(config_file, 'r') as f:
    tracker_config = yaml.load(f.read(), Loader=yaml.FullLoader)

# prepare model
# The config's 'MODEL' entry names the constructor looked up in lib.models.models.
net = models.__dict__[tracker_config['MODEL']](padding_mode='constant')
net = load_pretrain(net, model_pretrained)
# Evaluation mode, moved to the GPU (presumably needs a CUDA device — TODO confirm).
net = net.eval().cuda()

# prepare tracker
# Narrow the config to the settings for `model_carac`; this rebinds `tracker_config`,
# so the full configuration is no longer available after this line.
tracker_config = tracker_config['TRACKER'][model_carac]
tracker = SESiamFCTracker(net, **tracker_config)

# Go back up two levels, undoing the earlier `%cd models/SiamSE`.
%cd ../..

| using constant padding
| using scales: [0.8333333333333334, 1.0, 1.2]
| using ordinary correlation
load pretrained model from checkpoint_vot.pth
remove prefix "module."
missing keys:set()
unused checkpoint keys:set()
/mnt/c/Users/33652/Documents/ING 2 Prog/DeepNN/projet


# Tracking

In [16]:
def get_axis_aligned_bbox(bbox):
    """Turn a top-left box ``[x, y, w, h]`` into a centred box ``[xc, yc, w, h]``.

    Args:
        bbox: iterable of exactly four numbers ``(x, y, w, h)``.

    Returns:
        A new list ``[xc, yc, w, h]``; the width and height are passed through
        unchanged, the centre is the top-left corner shifted by half the size.
    """
    left, top, width, height = bbox
    center_x = left + width / 2
    center_y = top + height / 2
    return [center_x, center_y, width, height]

In [17]:
def track(tracker, video, verbose=True, reinit_gap=5):
    """Run `tracker` over one video, re-initialising it after every failure.

    The tracker is initialised on the ground-truth box of the first frame. On each
    following frame the predicted box is scored against the ground truth with
    ``scoring.get_precision``; a score that is not strictly positive counts as a
    failure, after which the tracker is re-initialised from the ground truth
    ``reinit_gap`` frames later.

    Args:
        tracker: object exposing ``init(image, target_pos, target_sz)`` and
            ``track(image) -> (target_pos, target_sz)``.
        video: dict with the keys ``'name'`` (str), ``'image_files'`` (list of image
            paths) and ``'gt'`` (one ``[x, y, w, h]`` box per frame).
        verbose: when True (default, the original behaviour) print the ground truth,
            the predicted box and the score for every tracked frame.
        reinit_gap: number of frames between a failure and the re-initialisation
            (default 5, the value previously hard-coded).

    Returns:
        A list with one entry per frame:
          * the ground-truth box on frames where the tracker was (re-)initialised,
          * the predicted box on successfully tracked frames,
          * ``2`` on a frame where tracking failed,
          * ``0`` on frames skipped while waiting for re-initialisation.
        (The 2/0 markers presumably follow the VOT reset-based protocol — TODO confirm.)

    Raises:
        FileNotFoundError: if an image file cannot be read.
    """
    image_files, gt = video['image_files'], video['gt']
    regions = []
    start_frame = 0
    ticks = 0

    for f, image_file in enumerate(image_files):
        im = cv2.imread(image_file)
        if im is None:
            # cv2.imread reports a missing/unreadable file by returning None
            # instead of raising; fail here with a clear message.
            raise FileNotFoundError('Could not read image: {}'.format(image_file))
        im = convert_color_RGB(im)

        # Only the tracker work is timed, not image loading/conversion.
        tic = cv2.getTickCount()

        if f == start_frame:  # (re-)initialise from the ground truth
            cx, cy, w, h = get_axis_aligned_bbox(gt[f])
            tracker.init(im, np.array([cx, cy]), np.array([w, h]))
            regions.append(gt[f])

        elif f > start_frame:  # tracking
            target_pos, target_sz = tracker.track(im)
            location = cxy_wh_2_rect(target_pos, target_sz)
            b_overlap = scoring.get_precision(gt[f], location)
            if verbose:
                print(gt[f], location, b_overlap)
            if b_overlap > 0:
                regions.append(location)
            else:
                # Failure: mark the frame and schedule the re-initialisation.
                regions.append(2)
                start_frame = f + reinit_gap
        else:
            # Frame skipped between a failure and the next re-initialisation.
            regions.append(0)

        ticks += cv2.getTickCount() - tic

    elapsed = ticks / cv2.getTickFrequency()
    # Use the real frame count (not the last index) and guard against a zero
    # elapsed time, which also covers an empty video.
    n_frames = len(image_files)
    fps = n_frames / elapsed if elapsed > 0 else 0.0
    print('Video: {:12s} Time: {:2.1f}s Speed: {:3.1f}fps'.format(video['name'], elapsed, fps))

    return regions


In [12]:
# Run the tracker on the 'agility' sequence of the VOT dataset; for each tracked frame
# the ground-truth box, the predicted box and their score are printed.
track(tracker, vot_dataset['agility'])

[553, 242, 25, 41] [555.9291338582677, 247.11023622047242, 24.0, 58.0] 0.46608785427907456
[558, 241, 22, 38] [571.992125984252, 246.16535433070862, 24.0, 58.0] 0.10956630922992873
[557, 244, 28, 43] [580.1154602689147, 241.41354341890607, 23.501362926737517, 56.79496040628233] 0.07933039682877097
[558, 255, 26, 51] [580.1154602689147, 235.86204036534602, 23.501362926737517, 56.79496040628233] 0.045720095616418405
[551, 270, 30, 37] [573.0706973404349, 234.9262749156334, 24.020547986106124, 58.049657633089794] 0.054841930168749393
[537, 275, 34, 37] [568.9070052957002, 238.0511528777621, 23.521483996691416, 56.84358632533758] 0.010158788770822845
[524, 280, 34, 35] [563.6594301258347, 240.2119191241773, 23.521483996691416, 56.84358632533758] 0
[487, 275, 28, 33] [476.68291618390526, 275.36358953258724, 42.106608577071384, 34.272820934825546] 0.6265810801997789
[495, 283, 16, 24] [479.22527600975263, 278.7266890850029, 41.23177874099306, 33.5607501380176] 0.27750287968487636
[0, 0, 0, 0

In [20]:
# Run the tracker on the 'Airport_ce' sequence of the TC dataset; for each tracked frame
# the ground-truth box, the predicted box and their score are printed.
track(tracker, tc_dataset['Airport_ce'])

[906, 365, 83, 322] [910.0, 364.49212598425197, 80.0, 322.0] 0.9375141330626762
[904, 365, 82, 321] [908.492125984252, 362.98425196850394, 80.0, 322.0] 0.9087292258196635
[903, 370, 80, 316] [906.9842519685039, 361.4763779527559, 80.0, 322.0] 0.8743075338288906
[903, 370, 78, 319] [905.4763779527559, 361.4763779527559, 80.0, 322.0] 0.8764263413177069
[901, 371, 77, 319] [902.4606299212599, 361.4763779527559, 80.0, 322.0] 0.8820154332327635
[899, 377, 79, 312] [900.9527559055118, 361.4763779527559, 80.0, 322.0] 0.8797270068560727
[900, 377, 72, 308] [899.4448818897638, 362.98425196850394, 80.0, 322.0] 0.8607834507042254
[894, 379, 77, 301] [897.9370078740158, 362.98425196850394, 80.0, 322.0] 0.8136817848111588
[895, 379, 73, 301] [896.4291338582677, 362.98425196850394, 80.0, 322.0] 0.8216125824821473
[895, 376, 73, 301] [893.4133858267717, 362.98425196850394, 80.0, 322.0] 0.8529891304347826
[895, 376, 73, 301] [891.9055118110236, 364.49212598425197, 80.0, 322.0] 0.8529891304347826
[891,