<a href="https://colab.research.google.com/github/Shu244/CUDA-Enabled-VGG16-Replica/blob/master/VGG16.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
#Sets up environment variables to allow cudatoolkit and numba to function properly.
import os
os.environ['NUMBAPRO_LIBDEVICE'] = "/usr/local/cuda-10.0/nvvm/libdevice"
os.environ['NUMBAPRO_NVVM'] = "/usr/local/cuda-10.0/nvvm/lib64/libnvvm.so"

In [39]:
!git clone https://github.com/Shu244/CUDA-Enabled-VGG16-Replica.git
from google.colab import drive
drive.mount('/content/drive')

fatal: destination path 'CUDA-Enabled-VGG16-Replica' already exists and is not an empty directory.
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
from PIL import Image
import numpy as np
from numba import cuda, vectorize
import h5py


@cuda.jit
def matrix_mul(matrix1, matrix2, result):
    '''
    Matrix 1 and 2 must be 2D arrays.
    '''

    thread_r, thread_c = cuda.grid(2)  # Gets position of thread.
    thread_r_size, thread_c_size = cuda.gridsize(2)  # Gets total number of threads in each dimension.

    result_r, result_c = result.shape

    for thread_r_i in range(thread_r, result_r, thread_r_size):
        for thread_c_i in range(thread_c, result_c, thread_c_size):
            # The following code will calculate one element in the result matrix at location thread_r_i and thread_c_i.
            for vector_i in range(0, matrix2.shape[0]):
                result[thread_r_i, thread_c_i] = result[thread_r_i, thread_c_i] + matrix1[thread_r_i, vector_i] * matrix2[vector_i, thread_c_i]


@cuda.jit
def same_convolve_multiple_filters(unpadded_input, kernal, result):
    '''
    Strides = 1.
    unpadded_input and result must have the same shape.
    Expected shape of the unpadded_input: channels, height, width
    Expected shape of the kernal: kernals, channels, height, width.
    Expected shape result: channels, height, width
    '''
    thread_channel, thread_r, thread_c = cuda.grid(3)  # Gets position of thread.
    thread_channel_size, thread_r_size, thread_c_size = cuda.gridsize(
        3)  # Gets total number of threads in each dimension.

    kernal_channels, kernal_r, kernal_c = kernal.shape[1:]

    # Padding that must be applied to all borders to maintain same dimension.
    r_pad, c_pad = ((kernal_r - 1) // 2), ((kernal_c - 1) // 2)

    result_channel, result_r, result_c = result.shape

    for thread_channel_i in range(thread_channel, result_channel, thread_channel_size):
        for thread_r_i in range(thread_r, result_r, thread_r_size):
            for thread_c_i in range(thread_c, result_c, thread_c_size):
                corner_input_r_i = thread_r_i - r_pad
                corner_input_c_i = thread_c_i - c_pad
                for kernal_channel_i in range(0, kernal_channels):
                    for kernal_r_i in range(0, kernal_r):
                        for kernal_c_i in range(0, kernal_c):
                            input_r_i = corner_input_r_i + kernal_r_i
                            input_c_i = corner_input_c_i + kernal_c_i
                            if 0 <= input_r_i < result_r and 0 <= input_c_i < result_c:
                                new_result = unpadded_input[kernal_channel_i, input_r_i, input_c_i] * kernal[thread_channel_i, kernal_channel_i, kernal_r_i, kernal_c_i]
                                result[thread_channel_i, thread_r_i, thread_c_i] = result[thread_channel_i, thread_r_i, thread_c_i] + new_result


@cuda.jit
def max_pooling_multiple_filters(input, window, result):
    '''
    Expectation: the dimensions of the input is disivible by dimensions of window. Further, windows do not overlap.
    Expected shape of the input: channels, height, width
    "window" is a tuple of format (height, width)
    Expected shape of result: channels, height, width
    '''
    thread_channel, thread_r, thread_c = cuda.grid(3)  # Gets position of thread.
    thread_channel_size, thread_r_size, thread_c_size = cuda.gridsize(3)  # Gets total number of threads in each dimension.

    window_r, window_c = window

    result_channel, result_r, result_c = result.shape

    for thread_channel_i in range(thread_channel, result_channel, thread_channel_size):
        for thread_r_i in range(thread_r, result_r, thread_r_size):
            for thread_c_i in range(thread_c, result_c, thread_c_size):
                input_r_corner_i = thread_r_i * window_r
                input_c_corner_i = thread_c_i * window_c
                max = input[thread_channel_i, input_r_corner_i, input_c_corner_i]
                for window_r_i in range(0, window_r):
                    for window_c_i in range(0, window_c):
                        element = input[thread_channel_i, input_r_corner_i + window_r_i, input_c_corner_i + window_c_i]
                        if element > max:
                            max = element
                result[thread_channel_i, thread_r_i, thread_c_i] = max


@cuda.jit
def add_biases_2D(input, biases):
    '''
    Input is of format: height, width
    There must be one bias per input element.
    is one bias per layer.
    '''
    thread_r, thread_c = cuda.grid(2)  # Gets position of thread.
    thread_r_size, thread_c_size = cuda.gridsize(2)  # Gets total number of threads in each dimension.

    result_r, result_c = input.shape

    for thread_r_i in range(thread_r, result_r, thread_r_size):
        for thread_c_i in range(thread_c, result_c, thread_c_size):
            input[thread_r_i, thread_c_i] = input[thread_r_i, thread_c_i] + biases[thread_r_i, thread_c_i]


@cuda.jit
def add_biases_3D(input, biases):
    '''
    Input is of format: channels, height, width
    The number of biases must be equal to the number of layers as there
    is one bias per layer.
    '''
    thread_channel, thread_r, thread_c = cuda.grid(3)  # Gets position of thread.
    thread_channel_size, thread_r_size, thread_c_size = cuda.gridsize(3)  # Gets total number of threads in each dimension.

    result_channel, result_r, result_c = input.shape

    for thread_channel_i in range(thread_channel, result_channel, thread_channel_size):
        for thread_r_i in range(thread_r, result_r, thread_r_size):
            for thread_c_i in range(thread_c, result_c, thread_c_size):
                input[thread_channel_i, thread_r_i, thread_c_i] = input[thread_channel_i, thread_r_i, thread_c_i] + biases[thread_channel_i]


# Using float64 because h5 file stores parameters in float64.
@vectorize(['float32(float32)', 'float64(float64)'], target='cuda')
def ReLU(activation):
    if activation > 0:
        return activation
    return 0
    ''' 
    return max(activation, 0) causes racing condition to occur. 
    '''


def softmax(activations):
    '''
    Used only in last layer with 1000 neurons. Not worth the data transfer speed
    to involve GPU.
    '''
    eactivations = np.exp(activations)
    etotal = sum(eactivations)
    return eactivations / etotal


class VGG16:
    def __init__(self, params_path, categories_path):
        # reading in weights and baises now.
        self.params = h5py.File(params_path, 'r')
        self.input = None
        self.categories = np.array([line.rstrip('\n') for line in open(categories_path)])

    def classify(self, image_path):
        img = Image.open(image_path)
        img = img.resize((224, 224), Image.ANTIALIAS)
        # Flips image from RGB to BGR
        img = np.flip(img, [2])
        # Gets image to be in the format: channels, height, width
        img = np.rollaxis(img, 2, 0)  
        #print('Tranposed original image:\n\n',img[0,:5,:5])
        # Put img array in C-contiguous format
        img = img.copy(order='C')
        img_device = cuda.to_device(img)
        self.input = img_device

        logits = self.forward_propagation()
        return self.categorize(logits)

    def categorize(self, logits, top_num=5):
        guesses = []
        for i in range(0, top_num):
            index = np.argmax(logits)
            output = (self.categories[index] + " (prob: {0:.3f})").format(logits[index])
            guesses.append(output)
            logits[index] = -1
        return guesses

    def forward_propagation(self):
        layers_with_pool = (1, 3, 6, 9, 12)
        threads_per_block = (4, 8, 16)
        blocks_per_grid = (10, 10)
        pool_window_height, pool_window_width = 2, 2
        
        #TEST
        #orinput = self.input.copy_to_host()
        #print('original image:\n\n',orinput[0,:5,:5])

        # going through all convolution layers.
        for i in range(0, 13):
            filters = self.params[str(i)]['weights']    
            biases = self.params[str(i)]['biases']
            filters_device = cuda.to_device(filters)
            biases_device = cuda.to_device(biases)

            # Computes convolution.
            result = np.zeros((filters.shape[0],) + self.input.shape[1:])
            out_device = cuda.device_array_like(result)
            same_convolve_multiple_filters[blocks_per_grid, threads_per_block](self.input, filters_device, out_device)

            # Applying biases now.
            add_biases_3D[blocks_per_grid, threads_per_block](out_device, biases_device)          

            # Applying ReLU now.
            out_device = ReLU(out_device)           
            
            # # Test print
            # test = out_device.copy_to_host()
            # print('weights:\n\n', filters[0,2,:,:])
            # print("outputs: \n\n", test[0, :10, :10])
            # return

            # Applying maxpool now, if necessary.
            if i in layers_with_pool:
                channels, height, width = out_device.shape
                window_device = cuda.to_device((pool_window_height, pool_window_width))
                new_out_device = cuda.device_array((channels, height // pool_window_height, width // pool_window_width))
                max_pooling_multiple_filters[blocks_per_grid, threads_per_block](out_device, window_device, new_out_device)
                out_device = new_out_device

            self.input = out_device

        # Must apply fully connected layers now.
        threads_per_block = (32, 16)
        blocks_per_grid = (10, 10)
        # Converting volume to vector.
        num_elements = np.prod(self.input.shape)
        self.input = self.input.reshape((1, num_elements))
        for i in range(13, 16):
            # Setting up for CUDA Numba.
            weights = self.params[str(i)]['weights']
            biases = self.params[str(i)]['biases']
            weights_device = cuda.to_device(weights)
            biases_device = cuda.to_device(biases)

            mult_result = np.zeros((self.input.shape[0], weights.shape[1]))
            device_mult_result = cuda.to_device(mult_result)

            # Performing matrix multiplication on GPU.
            matrix_mul[blocks_per_grid, threads_per_block](self.input, weights_device, device_mult_result)

            # Must add biases now.
            biases_device = biases_device.reshape(device_mult_result.shape)
            add_biases_2D(device_mult_result, biases_device)          

            # Must apply ReLU now
            self.input = ReLU(device_mult_result)

        # Now applying softmax functon.
        result = self.input.copy_to_host()
        result = result.reshape(result.shape[1])
        return softmax(result)
        

In [74]:
image_paths = []

image_paths.append('/content/drive/My Drive/Colab Notebooks/VGG16 Test Data/dog photo.jpg')
image_paths.append('/content/drive/My Drive/Colab Notebooks/VGG16 Test Data/orangutan2.jpg')
image_paths.append('/content/drive/My Drive/Colab Notebooks/VGG16 Test Data/orangutan3.jpg')
image_paths.append('/content/drive/My Drive/Colab Notebooks/VGG16 Test Data/orangutan4.jpg')
image_paths.append('/content/drive/My Drive/Colab Notebooks/VGG16 Test Data/orangutan5.jpg')
image_paths.append('/content/drive/My Drive/Colab Notebooks/VGG16 Test Data/orangutan6.jpg')
image_paths.append('/content/drive/My Drive/Colab Notebooks/VGG16 Test Data/magpie.jfif')
image_paths.append('/content/drive/My Drive/Colab Notebooks/VGG16 Test Data/ostrich.jfif')
image_paths.append('/content/drive/My Drive/Colab Notebooks/VGG16 Test Data/scorpion.jfif')
image_paths.append('/content/drive/My Drive/Colab Notebooks/VGG16 Test Data/sea snake.jfif')
image_paths.append('/content/drive/My Drive/Colab Notebooks/VGG16 Test Data/tick.jfif')

data_path = '/content/drive/My Drive/Colab Notebooks/VGG16 Test Data/vgg16_weights_reformatted.h5'
categories_path = '/content/CUDA-Enabled-VGG16-Replica/Categories.txt'

model = VGG16(data_path, categories_path)
for paths in image_paths:
    print(model.classify(paths))

['golden retriever (prob: 0.252)', 'Border collie (prob: 0.229)', 'collie (prob: 0.151)', 'tennis ball (prob: 0.095)', 'Shetland sheepdog, Shetland sheep dog, Shetland (prob: 0.048)']
['orangutan, orang, orangutang, Pongo pygmaeus (prob: 1.000)', 'gorilla, Gorilla gorilla (prob: 0.000)', 'gibbon, Hylobates lar (prob: 0.000)', 'patas, hussar monkey, Erythrocebus patas (prob: 0.000)', 'howler monkey, howler (prob: 0.000)']
['orangutan, orang, orangutang, Pongo pygmaeus (prob: 0.932)', 'gorilla, Gorilla gorilla (prob: 0.065)', 'guenon, guenon monkey (prob: 0.001)', 'siamang, Hylobates syndactylus, Symphalangus syndactylus (prob: 0.000)', 'gibbon, Hylobates lar (prob: 0.000)']
['orangutan, orang, orangutang, Pongo pygmaeus (prob: 0.999)', 'howler monkey, howler (prob: 0.000)', 'siamang, Hylobates syndactylus, Symphalangus syndactylus (prob: 0.000)', 'gorilla, Gorilla gorilla (prob: 0.000)', 'spider monkey, Ateles geoffroyi (prob: 0.000)']
['orangutan, orang, orangutang, Pongo pygmaeus (pro