In [1]:
import numba as cuda

In [5]:
import numpy as np

In [6]:
def bsort(X):
    """Bubble-sort the array ``X`` in place into ascending order.

    Each outer pass sweeps the still-unsorted prefix and swaps adjacent
    elements that are out of order, which moves the largest remaining value
    to the end of that prefix; the prefix then shrinks by one element.

    This function is also handed to the JIT compiler elsewhere in the
    notebook, so it sticks to plain loops and scalar indexing.

    Parameters
    ----------
    X : array-like exposing ``.shape`` and integer indexing
        Sorted in place; its length is read from ``X.shape[0]``.

    Returns
    -------
    None
    """
    length = X.shape[0]
    unsorted = length  # number of leading elements not yet in final position
    while unsorted > 1:
        for pos in range(unsorted - 1):
            left = X[pos]
            right = X[pos + 1]
            if left > right:
                # Adjacent pair is out of order: exchange the two slots.
                X[pos] = right
                X[pos + 1] = left
        unsorted -= 1

In [8]:
# `cuda` is this notebook's alias for the numba package (`import numba as cuda`),
# so the calls below are numba.config and numba.jit, not numba.cuda.
# Ask for the `parallel` option only when more than one numba thread is configured.
run_parallel = cuda.config.NUMBA_NUM_THREADS>1
# Wrap the pure-Python bsort with the JIT in nopython mode.
# NOTE(review): bsort only uses plain `range` loops; confirm that `parallel=True`
# actually changes anything for it.
parallel_bsort = cuda.jit(nopython=True, parallel=run_parallel )(bsort)

In [19]:
# Element dtype of the test arrays built inside compare_times().
dtype=np.int64

def compare_times(demo_size=15, bench_size=1500, repeats=10):
    """Check that ``bsort`` and ``parallel_bsort`` agree, then benchmark them.

    The function runs in two phases, both on descending (reverse-sorted)
    input built with the module-level ``dtype``:

    1. A small demo of ``demo_size`` elements: each implementation sorts a
       copy, the result and a single-call wall time are printed, and both
       results are checked against ``np.sort``.
    2. A benchmark of ``bench_size`` elements: each implementation sorts a
       fresh copy ``repeats`` times, and the mean time per call plus the
       resulting speedup are printed.

    NOTE(review): the printed labels say "CUDA", but ``cuda`` in this notebook
    is an alias for the ``numba`` package and ``parallel_bsort`` is built with
    ``cuda.jit(nopython=True, ...)``; confirm the wording is intended.
    NOTE(review): the demo call is the first call of ``parallel_bsort`` and
    presumably also triggers JIT compilation, so its printed time likely
    includes compile time. It also serves as a warm-up for the benchmark.

    Parameters
    ----------
    demo_size : int, optional
        Number of elements in the small printed demo (default 15).
    bench_size : int, optional
        Number of elements sorted in the benchmark (default 1500).
    repeats : int, optional
        Number of timed repetitions per implementation (default 10).

    Raises
    ------
    ValueError
        If ``repeats`` is less than 1.
    AssertionError
        If an implementation fails to sort the demo array, or if the two
        implementations disagree on the benchmark array.
    """
    # Local import keeps this cell runnable on its own; one clock is used for
    # every measurement so the demo and benchmark timings are comparable.
    from timeit import default_timer as timer

    if repeats < 1:
        raise ValueError("repeats must be at least 1")

    def _descending(count):
        # count-1, ..., 1, 0: every adjacent pair starts out of order.
        return np.arange(count - 1, -1, -1, dtype=dtype)

    def _mean_runtime(sort_fn, source):
        # Time `repeats` sorts of fresh copies; return (mean seconds, last result).
        start = timer()
        for _ in range(repeats):
            work = source.copy()
            sort_fn(work)
        return (timer() - start) / repeats, work

    # ---- Phase 1: small demo with printed results -------------------------
    Xte = _descending(demo_size)
    expected = np.sort(Xte)

    print("Serial implementation of inplace sort : ")
    X0 = Xte.copy()
    st = timer()
    bsort(X0)
    en = timer()
    print(X0)
    print("Time taken for serial execution  : {:.3f} seconds".format(en - st))

    print("Testing CUDA implementation of inplace sort : ")
    X1 = Xte.copy()
    sp = timer()
    parallel_bsort(X1)
    ep = timer()
    print(X1)
    print("Time taken for parallel execution : {:.3f} seconds".format(ep - sp))

    # Compare against a reference sort, not just against each other, so that
    # two implementations that are wrong in the same way cannot pass.
    assert np.array_equal(X0, expected)
    assert np.array_equal(X1, expected)

    # ---- Phase 2: benchmark ------------------------------------------------
    Xorig = _descending(bench_size)

    tpython, X0 = _mean_runtime(bsort, Xorig)
    tnumba, X1 = _mean_runtime(parallel_bsort, Xorig)

    assert np.array_equal(X0, X1)

    print("\nTime taken for the execution of size {}".format(bench_size))
    print('Python : ', tpython)
    print('Numba - Cuda : ', tnumba)
    print('\nSpeedup', tpython / tnumba, 'x')

In [18]:
compare_times()

Serial implementation of inplace sort : 
[0 1 2 3 4 5 6 7]
Time taken for serial execution  : 0.000 seconds
Testing CUDA implementation of inplace sort : 
[0 1 2 3 4 5 6 7]
Time taken for parallel execution : 0.000 seconds

Time taken for the execution of size 10000
Python :  24.448301293199982
Numba - Cuda :  0.026562912100052925

Speedup 920.3923576267555 x
