# Práctica 2: GPU Programming (CUDA)

### Autores:
Rafael Domínguez Sáez

Iñigo Martínez Ciriza

## Preparación del entorno

Antes de nada debemos preparar el entorno de ejecución.

Eliminación de datos innecesarios creados por Google Collab:

In [1]:
!rm -rf sample_data

Descarga de `Numba` en caso de no encontrarse en el sistema:

In [2]:
!pip install numba --upgrade



Importación de los paquetes necesarios para el correcto funcionamiento del código:

In [3]:
import numpy as np
from numba import cuda, float32
import time
import math

## 2.1 Compulsory assignment #1: Matrix transpose

In [4]:
Ax = 5_000
Ay = 7_000
Bx = Ay
By = Ax

def create_matrix_1(Ax: int, Ay: int) -> tuple:
  return np.random.rand(Ax, Ay), np.zeros((Ay, Ax))

In [5]:
@cuda.jit
def transpose_parallel(A, B):
    i, j = cuda.grid(2)
    if i < B.shape[0] and j < B.shape[1]:
        B[i, j] = A[j, i]

def transpose_sequential(A, B):
    for i in range(0, B.shape[0]):
        for j in range(0, B.shape[1]):
            B[i, j] = A[j, i]

In [6]:
A, B_seq = create_matrix_1(Ax, Ay)

t_start = time.time()
transpose_sequential(A, B_seq)
t_finish = time.time()

t_cpu = t_finish - t_start

print(f"Input \n", A)
print()
print(f"Output \n", B_seq)
print()
print(f"Tiempo ejecución en CPU = {t_cpu} s")

Input 
 [[0.44486102 0.09760581 0.44534834 ... 0.11800538 0.24164943 0.25565313]
 [0.63799815 0.85673611 0.84667396 ... 0.146714   0.11202021 0.16948548]
 [0.18774264 0.94004216 0.50880155 ... 0.97488335 0.47848894 0.31737487]
 ...
 [0.38250947 0.48603796 0.8950678  ... 0.52787403 0.13116489 0.55453895]
 [0.46752753 0.81841218 0.58308125 ... 0.65193755 0.11150992 0.49548127]
 [0.77426633 0.39595189 0.98209078 ... 0.75678176 0.18726394 0.04267305]]

Output 
 [[0.44486102 0.63799815 0.18774264 ... 0.38250947 0.46752753 0.77426633]
 [0.09760581 0.85673611 0.94004216 ... 0.48603796 0.81841218 0.39595189]
 [0.44534834 0.84667396 0.50880155 ... 0.8950678  0.58308125 0.98209078]
 ...
 [0.11800538 0.146714   0.97488335 ... 0.52787403 0.65193755 0.75678176]
 [0.24164943 0.11202021 0.47848894 ... 0.13116489 0.11150992 0.18726394]
 [0.25565313 0.16948548 0.31737487 ... 0.55453895 0.49548127 0.04267305]]

Tiempo ejecución en CPU = 18.126837730407715 s


In [7]:
_, B_par = create_matrix_1(Ax, Ay)

A_device = cuda.to_device(A)
B_device = cuda.to_device(B_par)

threads_per_block = (16, 16)

blocks_X = math.ceil(B_par.shape[0] / threads_per_block[0])
blocks_Y = math.ceil(B_par.shape[1] / threads_per_block[1])
blocks_total = (blocks_X, blocks_Y)

t_start = time.time()

transpose_parallel[blocks_total, threads_per_block](A_device, B_device)

cuda.synchronize()
t_finish = time.time()

B_par = B_device.copy_to_host()

t_gpu = t_finish - t_start

print("Input \n", A)
print()
print("Otuput \n", B_par)
print()
print(f"Tiempo ejecución en GPU = {t_gpu} s")

Input 
 [[0.44486102 0.09760581 0.44534834 ... 0.11800538 0.24164943 0.25565313]
 [0.63799815 0.85673611 0.84667396 ... 0.146714   0.11202021 0.16948548]
 [0.18774264 0.94004216 0.50880155 ... 0.97488335 0.47848894 0.31737487]
 ...
 [0.38250947 0.48603796 0.8950678  ... 0.52787403 0.13116489 0.55453895]
 [0.46752753 0.81841218 0.58308125 ... 0.65193755 0.11150992 0.49548127]
 [0.77426633 0.39595189 0.98209078 ... 0.75678176 0.18726394 0.04267305]]

Otuput 
 [[0.44486102 0.63799815 0.18774264 ... 0.38250947 0.46752753 0.77426633]
 [0.09760581 0.85673611 0.94004216 ... 0.48603796 0.81841218 0.39595189]
 [0.44534834 0.84667396 0.50880155 ... 0.8950678  0.58308125 0.98209078]
 ...
 [0.11800538 0.146714   0.97488335 ... 0.52787403 0.65193755 0.75678176]
 [0.24164943 0.11202021 0.47848894 ... 0.13116489 0.11150992 0.18726394]
 [0.25565313 0.16948548 0.31737487 ... 0.55453895 0.49548127 0.04267305]]

Tiempo ejecución en GPU = 1.7737162113189697 s


In [8]:
speedup = t_cpu / t_gpu

print(f"Speedup = {speedup}")

Speedup = 10.219694455477885


## 2.2 Compulsory assignment #2: Average Rows/Cols I

In [9]:
def Avg_Cols_sequential(input, output):
    for y in range(input.shape[1]):
        output[y] = 0.0
        for x in range(input.shape[0]):
            output[y] += input[x, y]
        output[y] /= input.shape[0]

def Avg_Rows_sequential(input, output):
    for y in range(input.shape[0]):
        output[y] = 0.0
        for x in range(input.shape[1]):
            output[y] += input[y, x]
        output[y] /= input.shape[1]

@cuda.jit
def Avg_Cols_parallel(input, output):
    y = cuda.grid(1)

    if y < input.shape[1]:
        sum_val = 0.0
        for x in range(input.shape[0]):
            sum_val += input[x, y]
        output[y] = sum_val / input.shape[0]

@cuda.jit
def Avg_Rows_parallel(input, output):
    y = cuda.grid(1)

    if y < input.shape[0]:
        sum_val = 0.0
        for x in range(input.shape[1]):
            sum_val += input[y, x]
        output[y] = sum_val / input.shape[1]


In [10]:
np.random.seed(0)

Ax = 4_000
Ay = 4_000

A = np.random.rand(Ax, Ay)

# CPU
B_cpu_cols = np.zeros(Ay)

t_start = time.time()
Avg_Cols_sequential(A, B_cpu_cols)
t_finish = time.time()

t_cpu_cols = t_finish - t_start

print("Input \n", A)
print()
print("Output CPU \n", B_cpu_cols)
print()
print(f"Tiempo ejecución en CPU = {t_cpu_cols} s")
print()

# GPU
B_gpu_cols = np.zeros(Ay)

A_device = cuda.to_device(A)
B_device = cuda.to_device(B_gpu_cols)

threads_per_block = 32
blocks_total = math.ceil(A_device.shape[1] / threads_per_block)

t_start = time.time()
Avg_Cols_parallel[blocks_total, threads_per_block](A_device, B_device)
cuda.synchronize()
t_finish = time.time()

B_gpu_cols = B_device.copy_to_host()
t_gpu_cols = t_finish - t_start

print("Output GPU \n", B_gpu_cols)
print()
print(f"Tiempo ejecución en GPU = {t_gpu_cols} s")
print()

# Speedup
speedup = t_cpu_cols / t_gpu_cols
print(f"Speedup = {speedup}")

Input 
 [[0.5488135  0.71518937 0.60276338 ... 0.83000295 0.93280618 0.30833843]
 [0.29264205 0.56651827 0.13741443 ... 0.80182819 0.5391446  0.83721853]
 [0.4577597  0.3769177  0.70233513 ... 0.84348096 0.94290928 0.83282242]
 ...
 [0.45568019 0.05480491 0.25982542 ... 0.10151857 0.47639488 0.7728146 ]
 [0.47772193 0.61759916 0.66554051 ... 0.63370207 0.7252492  0.75827622]
 [0.2176855  0.35912559 0.78489484 ... 0.25148401 0.36997825 0.72145565]]

Output CPU 
 [0.49843221 0.49760128 0.49841932 ... 0.50760745 0.50385554 0.50377579]

Tiempo ejecución en CPU = 11.211371183395386 s





Output GPU 
 [0.49843221 0.49760128 0.49841932 ... 0.50760745 0.50385554 0.50377579]

Tiempo ejecución en GPU = 0.24784374237060547 s

Speedup = 45.235643539592814


In [11]:
# CPU
B_cpu_rows = np.zeros(Ax)

t_start = time.time()
Avg_Rows_sequential(A, B_cpu_rows)
t_finish = time.time()

t_cpu_rows = t_finish - t_start

print("Input \n", A)
print()
print("Output CPU \n", B_cpu_rows)
print()
print(f"Tiempo ejecución en CPU = {t_cpu_rows} s")
print()

# GPU
B_gpu_rows = np.zeros(Ax)

A_device = cuda.to_device(A)
B_device = cuda.to_device(B_gpu_rows)

threads_per_block = 32
blocks_total = math.ceil(A_device.shape[0] / threads_per_block)

t_start = time.time()
Avg_Rows_parallel[blocks_total, threads_per_block](A_device, B_device)
cuda.synchronize()
t_finish = time.time()

B_gpu_rows = B_device.copy_to_host()
t_gpu_rows = t_finish - t_start

print("Output GPU \n", B_gpu_rows)
print()
print(f"Tiempo ejecución en GPU = {t_gpu_rows} s")
print()

# Speedup
speedup = t_cpu_rows / t_gpu_rows
print(f"Speedup = {speedup}")

Input 
 [[0.5488135  0.71518937 0.60276338 ... 0.83000295 0.93280618 0.30833843]
 [0.29264205 0.56651827 0.13741443 ... 0.80182819 0.5391446  0.83721853]
 [0.4577597  0.3769177  0.70233513 ... 0.84348096 0.94290928 0.83282242]
 ...
 [0.45568019 0.05480491 0.25982542 ... 0.10151857 0.47639488 0.7728146 ]
 [0.47772193 0.61759916 0.66554051 ... 0.63370207 0.7252492  0.75827622]
 [0.2176855  0.35912559 0.78489484 ... 0.25148401 0.36997825 0.72145565]]

Output CPU 
 [0.49910374 0.49199426 0.49593635 ... 0.50096485 0.50020686 0.4928522 ]

Tiempo ejecución en CPU = 9.331703901290894 s

Output GPU 
 [0.49910374 0.49199426 0.49593635 ... 0.50096485 0.50020686 0.4928522 ]

Tiempo ejecución en GPU = 0.18918156623840332 s

Speedup = 49.32670812932822




## 2.3 Compulsory assignment #3: Average Rows/Cols II (2 points)

In [12]:
@cuda.jit
def Avg_Cols_parallel_shared(input, output):
    shared_mem = cuda.shared.array(shape=(32,), dtype=float32)
    rows, cols = input.shape
    y = cuda.grid(1)
    tx = cuda.threadIdx.x

    if y < cols:
        shared_mem[tx] = 0.0
        cuda.syncthreads()
        for x in range(rows):
            cuda.atomic.add(shared_mem, tx, input[x, y])
        cuda.syncthreads()
        output[y] = shared_mem[tx] / rows

@cuda.jit
def Avg_Rows_parallel_shared(input, output):
    shared_mem = cuda.shared.array(shape=(32,), dtype=float32)
    rows, cols = input.shape
    x = cuda.grid(1)
    tx = cuda.threadIdx.x

    if tx < rows:
        shared_mem[tx] = 0.0
        cuda.syncthreads()
        for y in range(cols):
            cuda.atomic.add(shared_mem, tx, input[x, y])
        cuda.syncthreads()
        output[x] = shared_mem[tx] / cols

In [13]:
# GPU SHARED
B_gpu_cols_shared = np.zeros(Ay)

B_device = cuda.to_device(B_gpu_cols_shared)

threads_per_block = 32
blocks_total = math.ceil(A_device.shape[1] / threads_per_block)

t_start = time.time()
Avg_Cols_parallel_shared[blocks_total, threads_per_block](A_device, B_device)
t_finish = time.time()

B_gpu_cols_shared = B_device.copy_to_host()
t_gpu_cols_shared = t_finish - t_start

print("Output GPU \n", B_gpu_cols)
print()
print("Output GPU memoria compartida \n", B_gpu_cols_shared)
print()
print(f"Tiempo ejecución en GPU con memoria compartida = {t_gpu_cols_shared} s")
print()

# Speedup
speedup = t_cpu_cols / t_gpu_cols_shared
print(f"Speedup = {speedup}")

speedup = t_gpu_cols / t_gpu_cols_shared
print(f"Speedup memoria compartida = {speedup}")



Output GPU 
 [0.49843221 0.49760128 0.49841932 ... 0.50760745 0.50385554 0.50377579]

Output GPU memoria compartida 
 [0.49843121 0.4976011  0.49841986 ... 0.50760754 0.50385571 0.50377637]

Tiempo ejecución en GPU con memoria compartida = 0.28505802154541016 s

Speedup = 39.33013750188187
Speedup memoria compartida = 0.8694501597497533


In [14]:
# GPU SHARED
B_gpu_rows_shared = np.zeros(Ax)

B_device = cuda.to_device(B_gpu_rows_shared)

threads_per_block = 32
blocks_total = math.ceil(A_device.shape[0] / threads_per_block)

t_start = time.time()
Avg_Rows_parallel_shared[blocks_total, threads_per_block](A_device, B_device)
t_finish = time.time()

B_gpu_rows_shared = B_device.copy_to_host()
t_gpu_rows_shared = t_finish - t_start

print("Output GPU \n", B_gpu_rows)
print()
print("Output GPU memoria compartida \n", B_gpu_rows_shared)
print()
print(f"Tiempo ejecución en GPU con memoria compartida = {t_gpu_rows_shared} s")
print()

# Speedup
speedup = t_cpu_rows / t_gpu_rows_shared
print(f"Speedup = {speedup}")

speedup = t_gpu_rows / t_gpu_rows_shared
print(f"Speedup memoria compartida = {speedup}")



Output GPU 
 [0.49910374 0.49199426 0.49593635 ... 0.50096485 0.50020686 0.4928522 ]

Output GPU memoria compartida 
 [0.49910382 0.49199466 0.49593689 ... 0.50096503 0.50020764 0.49285245]

Tiempo ejecución en GPU con memoria compartida = 0.2787179946899414 s

Speedup = 33.480808842901965
Speedup memoria compartida = 0.6787561974563483


## 2.4 Optional assignment #1: Mean_3x3 (2 points)

In [39]:
def Mean_3x3_sequential(input, output):
    for x in range(input.shape[0]):
        for y in range(input.shape[1]):
            output[x, y] = 0.0
            for i in range(-1, 2):
                for j in range(-1, 2):
                    if x + i >= 0 and x + i < input.shape[0] and \
                        y + j >= 0 and y + j < input.shape[1]:
                        output[x, y] += input[x + i, y + j]
            output[x, y] /= 9.0

@cuda.jit
def Mean_3x3_parallel(input, output):
    shared_mem = cuda.shared.array((16 + 2, 16 + 2), dtype=float32)
    rows, cols = input.shape
    x, y = cuda.grid(2)

    tx = cuda.threadIdx.x
    ty = cuda.threadIdx.y

    if 0 <= x < rows and 0 <= y < cols:
        shared_mem[tx + 1, ty + 1] = input[x, y]
    else:
        shared_mem[tx + 1, ty + 1] = 0.0

    cuda.syncthreads()

    # Calcular la media 3x3 si estamos dentro del rango
    if x < rows and y < cols:
        total_sum = 0.0
        if y != 0 and y != cols-1 and x != 0 and x != rows-1:
            # Sumar los 8 vecinos y la celda central
            total_sum = (shared_mem[tx, ty] + shared_mem[tx, ty + 1] + shared_mem[tx, ty + 2] +
                        shared_mem[tx + 1, ty] + shared_mem[tx + 1, ty + 1] + shared_mem[tx + 1, ty + 2] +
                        shared_mem[tx + 2, ty] + shared_mem[tx + 2, ty + 1] + shared_mem[tx + 2, ty + 2])
        elif y == 0 and x == 0:
            total_sum = (shared_mem[tx + 1, ty + 1] + shared_mem[tx + 1, ty + 2] +
                         shared_mem[tx + 2, ty + 1] + shared_mem[tx + 2, ty + 2])
        elif y == 0 and x == rows-1:
            total_sum = (shared_mem[tx, ty + 1]     +  shared_mem[tx, ty + 2] +
                         shared_mem[tx + 1, ty + 1] +  shared_mem[tx + 1, ty + 2])
        elif y == cols-1 and x == rows-1:
            total_sum = (shared_mem[tx, ty]    +  shared_mem[tx, ty + 1] +
                        shared_mem[tx + 1, ty] +  shared_mem[tx + 1, ty + 1])
        elif y == cols-1 and x == 0:
            total_sum = (shared_mem[tx + 1, ty] + shared_mem[tx + 1, ty + 1] +
                         shared_mem[tx + 2, ty] + shared_mem[tx + 2, ty + 1])
        elif y == 0:
            total_sum = (shared_mem[tx, ty + 1] + shared_mem[tx, ty + 2] +
                        shared_mem[tx + 1, ty + 1] + shared_mem[tx + 1, ty + 2] +
                        shared_mem[tx + 2, ty + 1] + shared_mem[tx + 2, ty + 2])
        elif y == cols-1:
            total_sum = (shared_mem[tx, ty] + shared_mem[tx, ty + 1] +
                        shared_mem[tx + 1, ty] + shared_mem[tx + 1, ty + 1] +
                        shared_mem[tx + 2, ty] + shared_mem[tx + 2, ty + 1])
        elif x == 0:
            total_sum = (shared_mem[tx + 1, ty] + shared_mem[tx + 1, ty + 1] + shared_mem[tx + 1, ty + 2] +
                        shared_mem[tx + 2, ty] + shared_mem[tx + 2, ty + 1] + shared_mem[tx + 2, ty + 2])
        elif x == rows-1:
            total_sum = (shared_mem[tx, ty] + shared_mem[tx, ty + 1] + shared_mem[tx, ty + 2] +
                        shared_mem[tx + 1, ty] + shared_mem[tx + 1, ty + 1] + shared_mem[tx + 1, ty + 2])

        output[x, y] = total_sum / 9.0

In [43]:
np.random.seed(0)

Ax = 500
Ay = 500
A = np.random.rand(Ax, Ay)
B = np.zeros_like(A)

t_start = time.time()
Mean_3x3_sequential(A, B)
t_finish = time.time()

t_cpu = t_finish - t_start

print("Input \n", A)
print()
print("Output \n", B)
print()
print(f"Tiempo ejecución en CPU = {t_cpu} s")

Input 
 [[0.5488135  0.71518937 0.60276338 ... 0.40171354 0.24841347 0.50586638]
 [0.31038083 0.37303486 0.52497044 ... 0.93841202 0.22864655 0.67714114]
 [0.59288027 0.0100637  0.4758262  ... 0.11224999 0.04236405 0.22774099]
 ...
 [0.04027885 0.47565007 0.23954916 ... 0.86257219 0.78585522 0.86425687]
 [0.26993482 0.42351381 0.63561855 ... 0.56271762 0.35168929 0.28798926]
 [0.21022704 0.09253358 0.75865814 ... 0.50272661 0.68693019 0.1843027 ]]

Output 
 [[0.21637984 0.3416836  0.39015958 ... 0.30972799 0.33335479 0.18445195]
 [0.28337361 0.46154695 0.52289962 ... 0.34842103 0.37583868 0.21446362]
 [0.28557148 0.42141822 0.4884467  ... 0.43059867 0.4280333  0.25794777]
 ...
 [0.31307159 0.48448196 0.51958307 ... 0.63846398 0.6379794  0.42470454]
 [0.16801535 0.34955156 0.47562152 ... 0.57647788 0.56544888 0.35122484]
 [0.11068992 0.26560955 0.37219746 ... 0.28558861 0.28626174 0.16787905]]

Tiempo ejecución en CPU = 2.6276276111602783 s


In [44]:
B = np.zeros_like(A)

A_device = cuda.to_device(A)
B_device = cuda.to_device(B)

threads_per_block = (16,16)

blocks_X = math.ceil(B.shape[0] / threads_per_block[0])
blocks_Y = math.ceil(B.shape[1] / threads_per_block[1])
blocks_total = (blocks_X, blocks_Y)

t_start = time.time()
Mean_3x3_parallel[blocks_total, threads_per_block](A_device, B_device)
t_finish = time.time()

t_gpu = t_finish - t_start

B_par = B_device.copy_to_host()

print("Input \n", A)
print()
print("Output \n", B_par)
print()
print(f"Tiempo ejecución en GPU = {t_gpu} s")

Input 
 [[0.5488135  0.71518937 0.60276338 ... 0.40171354 0.24841347 0.50586638]
 [0.31038083 0.37303486 0.52497044 ... 0.93841202 0.22864655 0.67714114]
 [0.59288027 0.0100637  0.4758262  ... 0.11224999 0.04236405 0.22774099]
 ...
 [0.04027885 0.47565007 0.23954916 ... 0.86257219 0.78585522 0.86425687]
 [0.26993482 0.42351381 0.63561855 ... 0.56271762 0.35168929 0.28798926]
 [0.21022704 0.09253358 0.75865814 ... 0.50272661 0.68693019 0.1843027 ]]

Output 
 [[0.21637983 0.3416836  0.39015961 ... 0.30972799 0.33335479 0.18445195]
 [0.28337359 0.46154695 0.52289963 ... 0.34842104 0.3758387  0.21446362]
 [0.28557147 0.42141822 0.48844671 ... 0.43059868 0.42803327 0.25794776]
 ...
 [0.31307162 0.48448202 0.51958312 ... 0.63846397 0.63797935 0.42470455]
 [0.16801535 0.34955155 0.47562154 ... 0.5764779  0.56544892 0.35122485]
 [0.11068992 0.26560956 0.37219744 ... 0.28558861 0.28626174 0.16787904]]

Tiempo ejecución en GPU = 0.0004222393035888672 s


In [18]:
speedup = t_cpu / t_gpu
print(f"Speedup = {speedup}")

Speedup = 4.731156187370015


# Optional assignment #2: Reduction approaches (3 points)

In [19]:
def Reduce_sequential(input, output):
    acum = 0
    for i in range(input.shape[0]):
        acum += input[i]
    output[0] = acum


N = 10_000_000
np.random.seed(0)
A = np.random.rand(N)
output = np.zeros(1)

t_start = time.time()
Reduce_sequential(A, output)
t_finish = time.time()

t_cpu = t_finish - t_start

print("Input \n", A)
print()
print("Output \n", output[0])
print()
print(f"Tiempo ejecución en CPU = {t_cpu} s")

Input 
 [0.5488135  0.71518937 0.60276338 ... 0.7016828  0.45616281 0.14553608]

Output 
 4999995.59696607

Tiempo ejecución en CPU = 1.5949816703796387 s


## Reduction #1: Interleaved addressing with divergent branching

In [20]:
@cuda.jit
def reduce_interleaved_divergent(input, output):
    sSrc = cuda.shared.array(16, dtype=float32)
    sDst = cuda.shared.array(16, dtype=float32)


    tid = cuda.threadIdx.x
    bid = cuda.blockIdx.x
    bx = cuda.blockDim.x

    if tid + bid * bx < input.size:
        sSrc[tid] = input[tid + bid * bx]
    else:
        sSrc[tid] = 0
    cuda.syncthreads()

    stride = 2
    while stride <= bx:
        if tid % stride == 0:
            if tid + stride // 2 < bx:
                # sSrc[tid + stride // 2]
                sDst[tid] = sSrc[tid] + sSrc[tid + stride // 2]
        cuda.syncthreads()

        sSrc[tid] = sDst[tid]
        cuda.syncthreads()

        # Como se indica arriba, se multiplica stride * 2
        stride *= 2

    # El primer hilo de todos tiene el privilegio de guardar el resultado
    if tid == 0:
        cuda.atomic.add(output, 0, sDst[0])

In [21]:
output = np.zeros(1)

input_device = cuda.to_device(A)
output_device = cuda.to_device(output)

threads_per_block = 16
blocks_per_grid = math.ceil(N + threads_per_block - 1 / threads_per_block)

#TODO: calculo del tiempo en gpu
t_start = time.time()
reduce_interleaved_divergent[blocks_per_grid, threads_per_block](input_device, output_device)
t_finish = time.time()

t_gpu_1 = t_finish - t_start

output = output_device.copy_to_host()

print("Input \n", A)
print()
print("Output \n", output[0])
print()
print(f"Tiempo ejecución en GPU = {t_gpu_1} s")

Input 
 [0.5488135  0.71518937 0.60276338 ... 0.7016828  0.45616281 0.14553608]

Output 
 4999995.596932173

Tiempo ejecución en GPU = 0.18361759185791016 s


In [22]:
speedup = t_cpu / t_gpu_1
print(f"Speedup = {speedup}")

Speedup = 8.6864316988423


## Reduction #2: Interleaved addressing with no divergent branching

In [23]:
@cuda.jit
def reduce_interleaved_no_divergent(input, output):
    sSrc = cuda.shared.array(shape=16, dtype=float32)
    tid = cuda.threadIdx.x
    bid = cuda.blockIdx.x
    bx = cuda.blockDim.x

    idx = tid + bid * bx
    if idx < input.size:
        sSrc[tid] = input[idx]
    else:
        sSrc[tid] = 0.0
    cuda.syncthreads()

    stride = 1
    while stride < bx:
        index = tid + stride
        if index < bx:
            sSrc[tid] += sSrc[index]
        cuda.syncthreads()
        stride *= 2

    if tid == 0:
        cuda.atomic.add(output, 0, sSrc[0])

In [24]:
output = np.zeros(1)

input_device = cuda.to_device(A)
output_device = cuda.to_device(output)

threads_per_block = 16
blocks_per_grid = math.ceil(N + threads_per_block - 1 / threads_per_block)

t_start = time.time()
reduce_interleaved_no_divergent[blocks_per_grid, threads_per_block](input_device, output_device)
t_finish = time.time()

t_gpu_2 = t_finish - t_start

output = output_device.copy_to_host()

print("Input \n", A)
print()
print("Output \n", output[0])
print()
print(f"Tiempo ejecución en GPU = {t_gpu_1} s")

Input 
 [0.5488135  0.71518937 0.60276338 ... 0.7016828  0.45616281 0.14553608]

Output 
 4999995.596932173

Tiempo ejecución en GPU = 0.18361759185791016 s


In [25]:
speedup = t_cpu / t_gpu_2
print(f"Speedup = {speedup}")

Speedup = 10.142910858484015


## Reduction #3: Sequential addressing

In [26]:
@cuda.jit
def reduce_sequential_addressing(input, output):
    sSrc = cuda.shared.array(16, dtype=float32)

    tid = cuda.threadIdx.x
    bid = cuda.blockIdx.x
    bx = cuda.blockDim.x

    if tid + bid * bx < input.size:
        sSrc[tid] = input[tid + bid * bx]
    else:
        sSrc[tid] = 0
    cuda.syncthreads()

    stride = 1
    while stride < bx:
        index = 2 * stride * tid
        if index + stride < bx:
            sSrc[index] += sSrc[index + stride]
        cuda.syncthreads()

        stride *= 2

    if tid == 0:
        cuda.atomic.add(output, 0, sSrc[0])

In [27]:
output = np.zeros(1)

input_device = cuda.to_device(A)
output_device = cuda.to_device(output)

threads_per_block = 16
blocks_per_grid = math.ceil(N + threads_per_block - 1 / threads_per_block)

t_start = time.time()
reduce_sequential_addressing[blocks_per_grid, threads_per_block](input_device, output_device)
t_finish = time.time()

t_gpu_3 = t_finish - t_start

output = output_device.copy_to_host()

print("Input \n", A)
print()
print("Output \n", output[0])
print()
print(f"Tiempo ejecución en GPU = {t_gpu_1} s")

Input 
 [0.5488135  0.71518937 0.60276338 ... 0.7016828  0.45616281 0.14553608]

Output 
 4999995.596932173

Tiempo ejecución en GPU = 0.18361759185791016 s


In [28]:
speedup = t_cpu / t_gpu_3
print(f"Speedup = {speedup}")

Speedup = 7.758717750961165


## Reduction #4: Atomic addition

In [29]:
@cuda.jit
def reduce_atomic_addition(input, output):

    partial_sum = cuda.shared.array(16, dtype=float32)

    tid = cuda.threadIdx.x
    bid = cuda.blockIdx.x
    bx = cuda.blockDim.x

    if tid == 0:
        partial_sum[0] = 0.0
    cuda.syncthreads()

    if tid + bid * bx < input.size:
        cuda.atomic.add(partial_sum, 0, input[tid + bid * bx])
    cuda.syncthreads()

    if tid == 0:
        cuda.atomic.add(output, 0, partial_sum[0])

In [30]:
output = np.zeros(1)

input_device = cuda.to_device(A)
output_device = cuda.to_device(output)

threads_per_block = 16
blocks_per_grid = math.ceil(N + threads_per_block - 1 / threads_per_block)

t_start = time.time()
reduce_atomic_addition[blocks_per_grid, threads_per_block](input_device, output_device)
t_finish = time.time()

t_gpu_atom = t_finish - t_start

output = output_device.copy_to_host()

print("Input \n", A)
print()
print("Output \n", output[0])
print()
print(f"Tiempo ejecución en GPU = {t_gpu_1} s")

Input 
 [0.5488135  0.71518937 0.60276338 ... 0.7016828  0.45616281 0.14553608]

Output 
 4999995.597110033

Tiempo ejecución en GPU = 0.18361759185791016 s


In [31]:
speedup = t_cpu / t_gpu_atom
print(f"Speedup = {speedup}")

Speedup = 12.920981168517624
