In [79]:
%matplotlib inline
from matplotlib import pyplot as plt
import numpy as np
from numba import njit, cuda
from timeit import default_timer as timer
import simple

In [80]:
def get_offset(offset_ms: int, track: np.array, sr: int) -> tuple[int, np.array]:
    offset = int(sr / 1000 * offset_ms)
    return offset, track[offset:(offset + 1000)]

def comparator(a: np.array, b: np.array) -> int:
    return np.sum(np.abs(np.subtract(a, b)))

def calc_offset(a: np.array, b: np.array) -> int:
    comps = np.array([comparator(a, b[i:(i + 1000)]) for i in range(len(b) - 1000)])
    mn_arg = np.argmin(comps)
    return mn_arg if comps[mn_arg] == 0.0 else -1

@cuda.jit(device=True)
def diff(a, b):
    return abs(a - b)

@cuda.jit
def calc(curr_offset, track, res):
    off = cuda.threadIdx.x * 32 + cuda.threadIdx.y + cuda.blockIdx.y * 32**2

    if off >= res.shape[0] or cuda.blockIdx.x >= res.shape[1]:
        return

    res[off, cuda.blockIdx.x] = diff(curr_offset[cuda.blockIdx.x], track[off + cuda.blockIdx.x])

In [88]:
original = simple.load_sound_files()[0]
offsets = {}
for off in [10, 50, 124, 735, 1232, 2312]:
    k, v = get_offset(off, original.data, original.sample_rate) 
    offsets[k] = v
offsets.keys()

dict_keys([220, 1102, 2734, 16206, 27165, 50979])

In [89]:
samples = 12
results = { k: np.zeros((samples)) for k in offsets }

for correct, offset in offsets.items():
    for i in range(samples):
        start = timer()
        res = calc_offset(offset, original.data)
        end = timer()
        results[correct][i] = end - start if res >= 0 else -1

for k, v in results.items():
    if v.min() == -1:
        print(f"Bad value at {k}")
    results[k] = (np.sum(v) - v.min() - v.max()) / (samples - 2)
    print(f"{k:>6} = {results[k] * 1000 :.2f}ms")

   220 = 539.35ms
  1102 = 536.34ms
  2734 = 548.12ms
 16206 = 546.70ms
 27165 = 535.63ms
 50979 = 541.75ms


In [93]:
threads_per_block = (32, 32)
blocks_per_grid = (1000, int(np.ceil((original.len - 1000) / (threads_per_block[0] * threads_per_block[1]))))

samples = 13
results_cuda = { k: np.zeros((samples - 1)) for k in offsets }

for correct, offset in offsets.items():
    for i in range(samples):
        start = timer()
        curr_offset_gpu = cuda.to_device(offset)
        track_gpu = cuda.to_device(original.data)
        res_gpu = cuda.to_device(np.zeros((original.len - 1000, 1000), dtype=np.int16))
        calc[blocks_per_grid, threads_per_block](curr_offset_gpu, track_gpu, res_gpu)
        arg = np.argmin(np.sum(res_gpu.copy_to_host(), axis=1))
        end = timer()
        if i == 0:
            continue
        results_cuda[correct][i - 1] = end - start if arg >= 0 else -1

for k, v in results_cuda.items():
    if v.min() == -1:
        print(f"Bad value at {k}")
    results_cuda[k] = (np.sum(v) - v.min() - v.max()) / (samples - 2)
    print(f"{k:>6} = {results_cuda[k] * 1000 :.2f}ms")

   220 = 127.20ms
  1102 = 131.11ms
  2734 = 145.88ms
 16206 = 132.71ms
 27165 = 131.55ms
 50979 = 129.20ms
