# TensorFlow with GPU

This notebook provides an introduction to computing on a [GPU](https://cloud.google.com/gpu) in Colab. In this notebook you will connect to a GPU, and then run some basic TensorFlow operations on both the CPU and a GPU, observing the speedup provided by using the GPU.


## Enabling and testing the GPU

First, you'll need to enable GPUs for the notebook:

- Navigate to Edit→Notebook Settings
- Select GPU from the Hardware Accelerator drop-down

Next, we'll confirm that we can connect to the GPU with TensorFlow:

In [None]:
import tensorflow as tf
# Ask TensorFlow for the name of a GPU device; this check expects exactly
# '/device:GPU:0' and treats any other answer as "no GPU".
device_name = tf.test.gpu_device_name()
if device_name == '/device:GPU:0':
  print(f'Found GPU at: {device_name}')
else:
  raise SystemError('GPU device not found')

## Observe TensorFlow speedup on GPU relative to CPU

This example constructs a typical convolutional neural network layer over a
random image and manually places the resulting ops on either the CPU or the GPU
to compare execution speed.

In [None]:
import tensorflow as tf
import timeit

# Stop before benchmarking unless TensorFlow reports the GPU under the exact
# name '/device:GPU:0'; the gpu() function below pins its ops to that device.
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  # Print a hint about the likely cause before raising.
  print(
      '\n\nThis error most likely means that this notebook is not '
      'configured to use a GPU.  Change this in Notebook Settings via the '
      'command palette (cmd/ctrl-shift-P) or the Edit menu.\n\n')
  raise SystemError('GPU device not found')

def cpu():
  """Run one convolution over a random image batch on the CPU.

  Returns:
    The sum of all outputs of a fresh Conv2D(32, 7) layer applied to a
    random-normal tensor of shape (100, 100, 100, 3).
  """
  with tf.device('/cpu:0'):
    images = tf.random.normal((100, 100, 100, 3))
    conv = tf.keras.layers.Conv2D(32, 7)
    # Reduce to a single scalar so the whole convolution output is consumed.
    return tf.math.reduce_sum(conv(images))

def gpu():
  """Run one convolution over a random image batch on the first GPU.

  Returns:
    The sum of all outputs of a fresh Conv2D(32, 7) layer applied to a
    random-normal tensor of shape (100, 100, 100, 3).
  """
  with tf.device('/device:GPU:0'):
    images = tf.random.normal((100, 100, 100, 3))
    conv = tf.keras.layers.Conv2D(32, 7)
    # Reduce to a single scalar so the whole convolution output is consumed.
    return tf.math.reduce_sum(conv(images))

# We run each op once to warm up, so that one-time start-up cost is not part
# of the measurement; see: https://stackoverflow.com/a/45067900
cpu()
gpu()

# Time ten calls of each function. The callables are handed to timeit
# directly: the previous string statement with `from __main__ import ...`
# only works when cpu/gpu live in the __main__ namespace.
print('Time (s) to convolve 32x7x7x3 filter over random 100x100x100x3 images '
      '(batch x height x width x channel). Sum of ten runs.')
print('CPU (s):')
cpu_time = timeit.timeit(cpu, number=10)
print(cpu_time)
print('GPU (s):')
gpu_time = timeit.timeit(gpu, number=10)
print(gpu_time)
# int() truncates, so the ratio is reported as a whole number.
print('GPU speedup over CPU: {}x'.format(int(cpu_time/gpu_time)))

In [None]:
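#!pip install numba cupy-cuda12x  # CuPy wheels are named per CUDA version (e.g. cupy-cuda11x, cupy-cuda12x); pick the one matching your runtime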

In [None]:
from numba import cuda
import numpy as np
import math

# CUDA kernel: element-wise addition, one array element per thread.
@cuda.jit
def add_kernel(x, y, out):
    """Write x[i] + y[i] into out[i] for the single index i owned by this thread."""
    # Flattened index into the arrays: the offset of this thread's block
    # (block index * threads per block) plus the thread's index in the block.
    idx = cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x

    # The launch may contain more threads than elements; the extras do nothing.
    if idx < out.size:
        out[idx] = x[idx] + y[idx]

# Host-side inputs: x = 0, 1, ..., n-1 as float32, y = 2 * x, and a zeroed
# output buffer with x's shape and dtype.
n = 1000
x = np.arange(n, dtype=np.float32)
y = x * 2
out = np.zeros(x.shape, dtype=x.dtype)

# Launch configuration: enough blocks of 128 threads to cover all n elements
# (integer ceiling division).
threads_per_block = 128
blocks_per_grid = (n + threads_per_block - 1) // threads_per_block

# Launch the kernel with that grid/block configuration and show the result.
add_kernel[blocks_per_grid, threads_per_block](x, y, out)
print(out)


In [None]:
import cupy as cp

# Two small integer vectors created as CuPy arrays.
x_gpu = cp.array([1, 2, 3, 4, 5])
y_gpu = cp.array([6, 7, 8, 9, 10])

# Perform an element-wise addition on GPU; the result is another CuPy array.
z_gpu = x_gpu + y_gpu

# Bring the result back to CPU memory
z = z_gpu.get()

print(z)


In [None]:

import numpy as np
import time

# Create large arrays.
# np.random.rand returns float64, so each array takes roughly 2.4 GB.
size = 300000000  # 300 million elements
x = np.random.rand(size)
y = np.random.rand(size)

# Element-wise operation on CPU.
# perf_counter() is the clock intended for measuring short intervals; unlike
# time.time() it cannot jump if the system clock is adjusted.
start_time = time.perf_counter()
z = x + y + x*x  # This operation is performed on the CPU
end_time = time.perf_counter()

cpu_time = end_time - start_time
print(f"Time taken on CPU: {cpu_time} seconds")


In [None]:
import cupy as cp
import time

# Create large arrays on the GPU (`size` is the value set in the CPU cell).
x_gpu = cp.random.rand(size)
y_gpu = cp.random.rand(size)

# CuPy queues GPU work asynchronously, so the random-number generation above
# may still be running here. Wait for it to finish before starting the clock;
# otherwise its cost is counted as part of the element-wise operation.
cp.cuda.Stream.null.synchronize()

# Element-wise operation on GPU.
# NOTE: the first run of this cell may also include one-time kernel
# compilation; re-run the cell for a steady-state figure.
start_time = time.perf_counter()
z_gpu = x_gpu + y_gpu  + x_gpu*x_gpu # This operation is performed on the GPU
cp.cuda.Stream.null.synchronize()  # Ensure the operation is complete
end_time = time.perf_counter()

gpu_time = end_time - start_time
print(f"Time taken on GPU: {gpu_time} seconds")


In [None]:
import numpy as np
import time

# Initialize two random square matrices (float64).
size = 10000  # size of square matrix
A = np.random.rand(size, size)
B = np.random.rand(size, size)

# Perform matrix multiplication on CPU.
# perf_counter() is the clock intended for measuring short intervals; unlike
# time.time() it cannot jump if the system clock is adjusted.
start_time_cpu = time.perf_counter()
C = np.matmul(A, B)
end_time_cpu = time.perf_counter()

cpu_time = end_time_cpu - start_time_cpu
print(f"Time for matrix multiplication on CPU: {cpu_time} seconds")


In [None]:
import cupy as cp
import time

# Initialize matrices on the GPU (`size` is the value set in the CPU cell).
A_gpu = cp.random.rand(size, size)
B_gpu = cp.random.rand(size, size)

# CuPy queues GPU work asynchronously, so the random-number generation above
# may still be running here. Wait for it to finish before starting the clock;
# otherwise its cost is counted as part of the multiplication.
cp.cuda.Stream.null.synchronize()

# Perform matrix multiplication on GPU.
# NOTE: the first run of this cell may also include one-time library
# initialisation; re-run the cell for a steady-state figure.
start_time_gpu = time.perf_counter()
C_gpu = cp.matmul(A_gpu, B_gpu)
cp.cuda.Stream.null.synchronize()  # Make sure the current stream is done with all operations
end_time_gpu = time.perf_counter()

gpu_time = end_time_gpu - start_time_gpu
print(f"Time for matrix multiplication on GPU: {gpu_time} seconds")


In [None]:
from numba import cuda
import numpy as np

# A CUDA kernel is a GPU function launched from CPU (host) code.
# One launch runs the body once per CUDA thread, across many threads in parallel,
# and each thread uses its own indices to decide which element it handles.
@cuda.jit
def thread_block_grid_demo(output):
    """Record each thread's launch coordinates in `output`.

    Every thread whose flattened index falls inside `output` stores
    thread_index + 10 * block_index + 100 * block_width + 1000 * grid_width
    at that index, so the launch geometry can be read back from the values.
    """
    # Index of this thread inside its block (x dimension of a 1D block).
    thread_in_block = cuda.threadIdx.x

    # Index of this thread's block inside the grid (x dimension of a 1D grid).
    block_in_grid = cuda.blockIdx.x

    # Threads per block and blocks per grid; both are fixed by the launch
    # configuration and are the same for every thread.
    block_width = cuda.blockDim.x
    grid_width = cuda.gridDim.x

    # Flattened position of this thread across the whole grid: each block
    # covers one contiguous run of `block_width` elements.
    flat_idx = block_in_grid * block_width + thread_in_block

    # The grid can hold more threads than there are elements when the array
    # length is not a multiple of the block width; surplus threads write nothing.
    if flat_idx < output.size:
        # Pack the four values into one number, purely for demonstration.
        output[flat_idx] = (
            grid_width * 1000
            + block_width * 100
            + block_in_grid * 10
            + thread_in_block
        )

# Number of elements to compute.
n = 60

# Threads launched in each block.
threads_per_block = 5

# Blocks needed to cover all n elements: integer ceiling of
# n / threads_per_block, so a partial final block is still launched.
blocks_per_grid = (n - 1) // threads_per_block + 1

print(blocks_per_grid)

# float32 buffer that receives one value per element from the kernel.
output_array = np.zeros((n,), np.float32)

# Launch the kernel: the subscript gives (blocks in the grid, threads per
# block), and the call arguments are the kernel's parameters.
thread_block_grid_demo[blocks_per_grid, threads_per_block](output_array)

# output_array is a host (CPU) array; by this point it holds the values the
# kernel wrote on the device (GPU).

print(output_array)
