
## Initially we have this

In [None]:
from math import ceil
from itertools import product
from typing import Tuple

STEPS = [8, 16, 32]
MIN_SIZES = [[16, 32], [64, 128], [256, 512]]


def calculate_anchors(image_size: Tuple[int, int]):
    """Build the flat list of anchor values for one image size.

    For every entry of ``STEPS`` a grid of ``ceil(size / step)`` cells is
    walked row by row, and each cell yields one anchor per entry of the
    matching ``MIN_SIZES`` row.  Every anchor contributes four consecutive
    floats to the result: ``cx, cy, s_kx, s_ky``.

    Args:
        image_size: Two-element size.  ``image_size[0]`` normalises the y
            values and ``image_size[1]`` the x values, i.e. it is treated
            as ``(height, width)``.

    Returns:
        A flat ``list`` of floats whose length is four times the number of
        anchors.
    """
    height, width = image_size[0], image_size[1]
    flat = []
    for stride, sizes in zip(STEPS, MIN_SIZES):
        rows = ceil(height / stride)
        cols = ceil(width / stride)
        for row in range(rows):
            # Cell centre along y, normalised by the image height.
            cy = (row + 0.5) * stride / height
            for col in range(cols):
                # Cell centre along x, normalised by the image width.
                cx = (col + 0.5) * stride / width
                for size in sizes:
                    flat.extend((cx, cy, size / width, size / height))
    return flat

In [None]:
%timeit calculate_anchors((640, 480))

## Let's add some caching with a hash table

In [None]:
# Maps an image_size tuple to the anchor list already computed for it.
caches = {}


def calculate_anchors_with_caching(image_size: Tuple[int, int]):
    """Return the flat anchor list for ``image_size``, memoised in ``caches``.

    The first call for a given size computes the anchors from ``STEPS`` and
    ``MIN_SIZES`` and stores them in the module-level ``caches`` dict; later
    calls with the same size return the stored list.

    Args:
        image_size: Two-element size used as the cache key, so it must be
            hashable (e.g. a tuple).  ``image_size[0]`` normalises the y
            values and ``image_size[1]`` the x values.

    Returns:
        A flat ``list`` of floats, four per anchor (``cx, cy, s_kx, s_ky``).
        The same list object is handed out on every cache hit, so callers
        that mutate it also change what later calls receive.
    """
    try:
        return caches[image_size]
    except KeyError:
        pass

    anchors = []
    for k, step in enumerate(STEPS):
        rows = ceil(image_size[0] / step)
        cols = ceil(image_size[1] / step)
        for i, j in product(range(rows), range(cols)):
            cx = (j + 0.5) * step / image_size[1]
            cy = (i + 0.5) * step / image_size[0]
            for min_size in MIN_SIZES[k]:
                s_kx = min_size / image_size[1]
                s_ky = min_size / image_size[0]
                anchors += [cx, cy, s_kx, s_ky]

    # Store the computed anchors (not the key) so cache hits return real data.
    caches[image_size] = anchors
    return anchors

In [None]:
%timeit calculate_anchors_with_caching((640, 480))

## That's cheating, though: let's say the input size changes a lot

In [None]:
# 1000 distinct sizes, from (640, 480) up to (1639, 1479), so that benchmark
# calls are not all made with one and the same size.
input_sizes = [(640+i, 480+i) for i in range(1000)]

In [None]:
import random

%timeit calculate_anchors_with_caching(random.choice(input_sizes))

In [None]:
%timeit calculate_anchors(random.choice(input_sizes))

## Numba does not support these globals, so let's just move them into the local scope for now

In [None]:
import random
from typing import List
from numba import njit, prange
import numpy as np

@njit()
def calculate_anchors_numba(image_size: Tuple[int, int]) -> List[List[float]]:
    """Compute anchors with explicit loops inside an ``@njit`` function.

    Unlike a flat list of values, the result here is nested: one
    ``[cx, cy, s_kx, s_ky]`` row is appended per anchor.

    Args:
        image_size: Two-element size.  ``image_size[0]`` divides the y values
            and ``image_size[1]`` the x values, i.e. it is treated as
            ``(height, width)``.

    Returns:
        A list with one four-float row per anchor.
    """
    # Configuration lives inside the function rather than in module globals.
    steps = [8, 16, 32]
    min_sizes_cfg = [[16, 32], [64, 128], [256, 512]]
    # Grid size (rows, cols) per step: the image size divided by the step, rounded up.
    feature_maps = [[ceil(image_size[0]/step), ceil(image_size[1]/step)] for step in steps]
    anchors: List[List[float]] = []
    for k in range(len(feature_maps)):
        for i in range(feature_maps[k][0]):
            for j in range(feature_maps[k][1]):
                for min_size in min_sizes_cfg[k]:
                    # Anchor extent relative to the image.
                    s_kx = min_size / image_size[1]
                    s_ky = min_size / image_size[0]
                    # Centre of grid cell (i, j), relative to the image.
                    cx = (j + 0.5) * steps[k] / image_size[1]
                    cy = (i + 0.5) * steps[k] / image_size[0]
                    anchors.append([cx, cy, s_kx, s_ky])
    return anchors

# Warm up: call the function a few times before timing it, presumably so that
# numba's compilation on the first call is not part of the measurement.
for _ in range(10):
    calculate_anchors_numba(random.choice(input_sizes))

%timeit calculate_anchors_numba(random.choice(input_sizes))

In [None]:
# Compare the output sizes of the two versions for the same image size.
# calculate_anchors_numba appends one 4-element row per anchor, so len(res)
# is the number of anchors.
res = calculate_anchors_numba((640, 480))
print(len(res))
# calculate_anchors returns one flat list, so this shape has a single
# dimension holding four values per anchor.
res2 = calculate_anchors((640, 480))
res2 = np.array(res2)
print(res2.shape)

In [None]:
list(product([1,2,3], [4,5,6]))

# 3x faster already. Can we do more? Time for vectorization

In [None]:
import random
from typing import List
import numpy as np

# @njit()
def calculate_anchors_numba_more(image_size: Tuple[int, int]) -> np.ndarray:
    """Compute the anchor values with NumPy array operations.

    For each step in ``(8, 16, 32)`` a grid of ``ceil(size / step)`` cells is
    laid out row by row, and every cell gets one anchor per configured minimum
    size.  Each anchor is four floats, ``cx, cy, s_kx, s_ky``, and the anchors
    are concatenated in (step, row, column, min size) order.

    Args:
        image_size: Two-element size.  ``image_size[0]`` divides the y values
            and ``image_size[1]`` the x values, i.e. it is treated as
            ``(height, width)``.

    Returns:
        A one-dimensional ``float64`` array with four entries per anchor.
    """
    height, width = image_size[0], image_size[1]
    strides = (8, 16, 32)
    sizes_per_level = ((16, 32), (64, 128), (256, 512))

    # (rows, cols) of the cell grid for every stride.
    grids = [(ceil(height / stride), ceil(width / stride)) for stride in strides]
    counts = [
        rows * cols * len(sizes)
        for (rows, cols), sizes in zip(grids, sizes_per_level)
    ]

    out = np.empty((sum(counts), 4), dtype=np.float64)
    offset = 0
    for stride, (rows, cols), sizes, count in zip(strides, grids, sizes_per_level, counts):
        level = out[offset:offset + count]
        offset += count
        per_cell = len(sizes)

        # Row index: constant across a whole grid row (cols * per_cell anchors).
        row_idx = np.repeat(np.arange(rows), cols * per_cell)
        # Column index: constant across one cell, the pattern repeating for each row.
        col_idx = np.tile(np.repeat(np.arange(cols), per_cell), rows)
        # Minimum sizes cycle once per cell.
        size_col = np.tile(np.asarray(sizes, dtype=np.float64), rows * cols)

        level[:, 0] = (col_idx + 0.5) * stride / width
        level[:, 1] = (row_idx + 0.5) * stride / height
        level[:, 2] = size_col / width
        level[:, 3] = size_col / height

    return out.flatten()

# Warm up before timing.
# NOTE(review): the @njit() decorator on calculate_anchors_numba_more is
# commented out, so there is probably nothing to compile here — confirm
# whether this warm-up is still needed.
for _ in range(2):
    calculate_anchors_numba_more(random.choice(input_sizes))

%timeit calculate_anchors_numba_more(random.choice(input_sizes))

In [None]:
# Check the vectorized version against the loop-based calculate_anchors.
res_1 = calculate_anchors((640, 480))
res_2 = calculate_anchors_numba_more((640, 480))
res_1 = np.array(res_1)
print(res_1.shape, res_2.shape)
# NOTE(review): this bare expression is not the last statement of the cell,
# so a notebook will presumably not display it.
res_1[0], res_2[0]
# Exact element-wise comparison with no tolerance.
np.all(res_1 == res_2)

## Now that's 35 times faster


In [None]:
import math

def std(xs):
    """Return the population standard deviation of ``xs``.

    The mean is computed first, then the squared deviations from it are
    summed, divided by ``len(xs)`` and square-rooted.  Plain loops are used
    on purpose so the function can also be handed to ``njit``.

    Args:
        xs: A sized iterable of numbers (e.g. a list or a 1-D array).

    Returns:
        The square root of the mean squared deviation from the mean.

    Raises:
        ZeroDivisionError: In plain Python, when ``xs`` is empty.
    """
    count = len(xs)

    mean = 0
    for x in xs:
        mean += x
    mean = mean / count

    squared_dev_sum = 0
    for x in xs:
        # Accumulate every deviation; assigning here would keep only the last one.
        squared_dev_sum += (mean - x) ** 2
    variance = squared_dev_sum / count

    return math.sqrt(variance)


In [None]:
import numpy as np

%timeit std(np.random.normal(0, 1, 10000000))

In [None]:
from numba import njit

numba_std = njit(std)
%timeit numba_std(np.random.normal(0, 1, 10000000))

In [None]:
%timeit std(np.random.normal(0, 1, 10))

In [None]:
%timeit numba_std(np.random.normal(0, 1, 10))

In [None]:
def simple_sqrt(x):
    """Return the square root of ``x`` by delegating to ``math.sqrt``."""
    root = math.sqrt(x)
    return root

In [None]:
%timeit simple_sqrt(3)

In [None]:
numba_simple_sqrt = njit(simple_sqrt)

%timeit numba_simple_sqrt(3)