In [32]:
import cudf
import numpy as np
from numba import cuda
import time

In [33]:
# Problem size and launch configuration used by the cells below.
data_length = 10 ** 8        # number of rows generated for the 'in1' column
average_window = 4           # window length, passed to the kernels as average_length
threads_per_block = 128      # passed to apply_chunks as tpb
trunk_size = 10240           # spacing between chunk start offsets given to apply_chunks

In [34]:
# One float64 input column holding 0, 1, ..., data_length - 1.
df = cudf.DataFrame({'in1': np.arange(data_length, dtype=np.float64)})

In [35]:
def moving_average_kernel(in1, out, average_length):
    """Write the trailing moving average of one chunk of ``in1`` into ``out``.

    Written to be passed to ``DataFrame.apply_chunks``: ``in1`` and ``out``
    are the slices of the input and output columns for the current chunk.
    Each thread starts at row ``cuda.threadIdx.x`` and advances by
    ``cuda.blockDim.x``, so the threads of a block share the rows of a chunk.

    Parameters
    ----------
    in1 : 1-D array
        Input values for this chunk.
    out : 1-D array
        Output values for this chunk, written in place.
    average_length : int
        Number of rows in the averaging window (the current row and the
        ``average_length - 1`` rows before it).

    Rows that have fewer than ``average_length - 1`` predecessors inside the
    chunk are set to NaN.
    """
    tid = cuda.threadIdx.x
    stride = cuda.blockDim.x

    # Rows at the start of the chunk lack enough history and become NaN.
    # Clamp to the chunk length: a chunk shorter than the window (or an empty
    # one) must not be written past its end, because device code does not
    # bounds-check and the write would land outside the chunk.
    warmup_rows = min(average_length - 1, in1.size)
    for i in range(tid, warmup_rows, stride):
        out[i] = np.nan

    # For all other rows, average the preceding average_length rows
    # (inclusive of the current row).
    for i in range(tid + average_length - 1, in1.size, stride):
        summ = 0.0

        for j in range(i - average_length + 1, i + 1):
            summ += in1[j]

        out[i] = summ / np.float64(average_length)

def fill_missing_average_kernel(in1, out, average_length):
    """Recompute the moving average for the last rows of one chunk.

    Each thread starts at row ``in1.size - average_length + cuda.threadIdx.x``
    and advances by ``cuda.blockDim.x`` until the end of the chunk, writing
    the mean of the ``average_length`` rows ending at that row into ``out``.

    Parameters
    ----------
    in1 : 1-D array
        Input values for this chunk.
    out : 1-D array
        Output values for this chunk, written in place.
    average_length : int
        Number of rows in the averaging window.
    """
    first_row = in1.size - average_length + cuda.threadIdx.x

    # A thread whose first window would begin before the start of the chunk
    # has no complete history to average, so it does nothing.
    if first_row - average_length + 1 < 0:
        return

    for row in range(first_row, in1.size, cuda.blockDim.x):
        window_start = row - average_length + 1
        total = 0.0

        # Accumulate oldest to newest within the window.
        for offset in range(average_length):
            total += in1[window_start + offset]

        out[row] = total / np.float64(average_length)

In [36]:
start = time.perf_counter()

# Pass 1: moving average within each chunk. `chunks` lists the *start* offset
# of every chunk and the last chunk runs to the end of the frame, so
# data_length is not appended (it would add an empty trailing chunk).
df = df.apply_chunks(moving_average_kernel,
                     incols=['in1'],
                     outcols=dict(out=np.float64),
                     kwargs=dict(average_length=average_window),
                     chunks=list(range(0, data_length, trunk_size)),
                     tpb=threads_per_block)

# Pass 2: chunk boundaries shifted by average_window, so the rows pass 1 left
# as NaN at the start of each chunk now sit at the end of a shifted chunk,
# where their full history is available.
df = df.apply_chunks(fill_missing_average_kernel,
                     incols=['in1', 'out'],
                     outcols=dict(),
                     kwargs=dict(average_length=average_window),
                     chunks=[0] + list(range(average_window, data_length,
                                             trunk_size)),
                     tpb=threads_per_block)

# Kernel launches return before the GPU has finished; wait for completion so
# the measured interval covers the actual computation.
cuda.synchronize()
end = time.perf_counter()
print('cuDF time', end-start)

cuDF time 0.6082448959350586


In [37]:
# Preview the first ten rows. The first average_window - 1 rows of `out` have
# no full window of history; moving_average_kernel sets them to NaN.
print(df.head(10))

    in1  out
 0  0.0     
 1  1.0     
 2  2.0     
 3  3.0  1.5
 4  4.0  2.5
 5  5.0  3.5
 6  6.0  4.5
 7  7.0  5.5
 8  8.0  6.5
 9  9.0  7.5
