<a href="https://colab.research.google.com/github/TechDailyNotes/study-notes-cuda/blob/main/cuda_basics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [23]:
!nvcc --version
!pip install nvcc4jupyter
%load_ext nvcc4jupyter

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Tue_Aug_15_22:02:13_PDT_2023
Cuda compilation tools, release 12.2, V12.2.140
Build cuda_12.2.r12.2/compiler.33191640_0
The nvcc4jupyter extension is already loaded. To reload it, use:
  %reload_ext nvcc4jupyter


In [None]:
%%cuda
#include <stdio.h>

// Kernel: every launched thread prints the x-index of its block and its own
// x-index inside that block.
__global__ void hello() {
    const unsigned int blockId = blockIdx.x;
    const unsigned int threadId = threadIdx.x;
    printf("Hello from block %u, thread %u\n", blockId, threadId);
}

// Launches `hello` on 2 blocks of 2 threads each and waits for the device so
// the kernel's printf output is produced before the program exits.
// Returns 0 on success, 1 if the launch or the kernel execution failed.
int main() {
    hello<<<2, 2>>>();

    // A kernel launch has no return value; launch errors are only visible
    // through cudaGetLastError().
    cudaError_t status = cudaGetLastError();
    if (status == cudaSuccess) {
        // Errors raised while the kernel runs are reported by the next
        // synchronizing call.
        status = cudaDeviceSynchronize();
    }
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(status));
        return 1;
    }
    return 0;
}

Hello from block 0, thread 0
Hello from block 0, thread 1
Hello from block 1, thread 0
Hello from block 1, thread 1



# Chapter 1: Vector Addition

In [None]:
%%cuda

#include <assert.h>
#include <stdio.h>

// Element-wise vector addition kernel: d_c[i] = d_a[i] + d_b[i].
// Each thread handles one element whose index is derived from the
// x-dimension of the launch; threads with an index >= n do nothing.
__global__ void vectorAdd(int *d_a, int *d_b, int *d_c, int n) {
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= n) {
        return;
    }
    d_c[idx] = d_a[idx] + d_b[idx];
}

// Fills the first n entries of h_a with pseudo-random values in [0, 99],
// drawing one rand() per element in ascending index order.
void numInit(int *h_a, int n) {
    int *cursor = h_a;
    int remaining = n;
    while (remaining > 0) {
        *cursor = rand() % 100;
        ++cursor;
        --remaining;
    }
}

// Host-side verification: asserts that each of the first n entries of h_c
// equals the sum of the corresponding entries of h_a and h_b.
void numCheck(int *h_a, int *h_b, int *h_c, int n) {
    int i = 0;
    while (i < n) {
        assert(h_c[i] == h_a[i] + h_b[i]);
        ++i;
    }
}

// Host driver for the explicit-copy vector addition demo.
// Allocates three host and three device buffers of 2^16 ints, fills the two
// inputs with random values, adds them on the GPU and asserts on the host
// that the copied-back result matches.
int main() {
    const int n = 1 << 16;
    const size_t bytes = n * sizeof(int);

    // Host-side inputs (h_a, h_b) and output (h_c).
    int *h_a = (int*) malloc(bytes);
    int *h_b = (int*) malloc(bytes);
    int *h_c = (int*) malloc(bytes);

    // Device-side counterparts.
    int *d_a;
    int *d_b;
    int *d_c;
    cudaMalloc(&d_a, bytes);
    cudaMalloc(&d_b, bytes);
    cudaMalloc(&d_c, bytes);

    // Populate the inputs (h_a first, then h_b) and upload them.
    numInit(h_a, n);
    numInit(h_b, n);
    cudaMemcpy(d_a, h_a, bytes, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, h_b, bytes, cudaMemcpyHostToDevice);

    // One thread per element; round the block count up so all n are covered.
    const int numThreads = 256;
    const int numBlocks = (n + numThreads - 1) / numThreads;
    vectorAdd<<<numBlocks, numThreads>>>(d_a, d_b, d_c, n);

    // Copy the result back and verify it on the host.
    cudaMemcpy(h_c, d_c, bytes, cudaMemcpyDeviceToHost);
    numCheck(h_a, h_b, h_c, n);

    // Release device memory, then host memory.
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
    free(h_a);
    free(h_b);
    free(h_c);

    printf("Success!");
    return 0;
}

Success!


# Chapter 2: Unified Memory Vector Add

In [None]:
%%cuda

#include <assert.h>
#include <stdio.h>

// Element-wise vector addition kernel: c[i] = a[i] + b[i] for i < n.
// The element index comes from the x-dimension of the launch; threads that
// land past the end of the vectors write nothing.
__global__ void vectorAdd(int *a, int *b, int *c, int n) {
    const int gid = blockIdx.x * blockDim.x + threadIdx.x;
    if (gid < n) {
        const int sum = a[gid] + b[gid];
        c[gid] = sum;
    }
}

// Initializes the three vectors used by the vector-add demo.
//   a, b : inputs, filled with pseudo-random values in [0, 99]
//   c    : output, cleared to 0 so the later check only passes if the
//          kernel really wrote the sums
//   n    : number of elements in each vector
void numInit(int *a, int *b, int *c, int n) {
    for (int i = 0; i < n; i++) {
        a[i] = rand() % 100;
        b[i] = rand() % 100;
        c[i] = 0;
    }
}

// Asserts, element by element in ascending order, that c holds the sum of
// a and b over the first n entries.
void numCheck(int *a, int *b, int *c, int n) {
    for (int idx = 0; idx < n; ++idx) {
        assert(a[idx] + b[idx] == c[idx]);
    }
}

// Host driver for the unified-memory vector addition demo.
// Allocates three managed buffers of 2^16 ints, initializes them on the
// host, prefetches the inputs to the current device, runs the kernel,
// prefetches the result back to the CPU and verifies it.
int main() {
    // cudaGetDevice() returns a status code and writes the device ordinal
    // through its argument. Assigning the return value to `id` would replace
    // the ordinal with the status code, so the call is made on its own.
    int id = 0;
    cudaGetDevice(&id);

    int n = 1 << 16;
    size_t bytes = n * sizeof(int);

    int *a, *b, *c;

    // Managed allocations are addressable from both host and device.
    cudaMallocManaged(&a, bytes);
    cudaMallocManaged(&b, bytes);
    cudaMallocManaged(&c, bytes);

    numInit(a, b, c, n);

    int numThreads = 512;
    int numBlocks = (int) ceil(1.0 * n / numThreads);

    // Move the inputs to the GPU ahead of the kernel.
    cudaMemPrefetchAsync(a, bytes, id);
    cudaMemPrefetchAsync(b, bytes, id);
    vectorAdd<<<numBlocks, numThreads>>>(a, b, c, n);

    // Queue the result's migration back to the host behind the kernel, then
    // wait for both to finish before the host touches the data.
    cudaMemPrefetchAsync(c, bytes, cudaCpuDeviceId);
    cudaDeviceSynchronize();

    numCheck(a, b, c, n);

    cudaFree(a);
    cudaFree(b);
    cudaFree(c);

    printf("Success!");

    return 0;
}

Success!


# Chapter 3: Pinned Memory Vector Add

In [None]:
%%cuda

#include <assert.h>
#include <stdio.h>

// Element-wise vector addition kernel: d_c[i] = d_a[i] + d_b[i].
// The element index is computed from the x-dimension of the launch and only
// indices below n are written.
__global__ void vectorAdd(int *d_a, int *d_b, int *d_c, int n) {
    const int element = threadIdx.x + blockIdx.x * blockDim.x;
    const bool inRange = element < n;
    if (inRange) {
        d_c[element] = d_a[element] + d_b[element];
    }
}

// Host driver for the pinned-memory vector addition demo.
// Same flow as an explicit-copy vector add, except that the host buffers are
// obtained with cudaMallocHost and released with cudaFreeHost.
int main() {
    const int n = 1 << 16;
    const size_t bytes = n * sizeof(int);

    // Host buffers allocated through the CUDA runtime.
    int *h_a;
    int *h_b;
    int *h_c;
    cudaMallocHost(&h_a, bytes);
    cudaMallocHost(&h_b, bytes);
    cudaMallocHost(&h_c, bytes);

    // Device buffers.
    int *d_a;
    int *d_b;
    int *d_c;
    cudaMalloc(&d_a, bytes);
    cudaMalloc(&d_b, bytes);
    cudaMalloc(&d_c, bytes);

    // Random inputs in [0, 99]; a and b are drawn alternately.
    int idx = 0;
    while (idx < n) {
        h_a[idx] = rand() % 100;
        h_b[idx] = rand() % 100;
        ++idx;
    }

    // One thread per element, block count rounded up.
    const int numThreads = 256;
    const int numBlocks = (n + numThreads - 1) / numThreads;

    // Upload, compute, download.
    cudaMemcpy(d_a, h_a, bytes, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, h_b, bytes, cudaMemcpyHostToDevice);
    vectorAdd<<<numBlocks, numThreads>>>(d_a, d_b, d_c, n);
    cudaMemcpy(h_c, d_c, bytes, cudaMemcpyDeviceToHost);

    // Verify every element on the host.
    for (idx = 0; idx < n; ++idx) {
        assert(h_c[idx] == h_a[idx] + h_b[idx]);
    }

    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);

    cudaFreeHost(h_a);
    cudaFreeHost(h_b);
    cudaFreeHost(h_c);

    printf("Success!");
    return 0;
}

Success!


# Chapter 4: Matrix Multiplication

In [None]:
%%cuda

#include <assert.h>
#include <stdio.h>

// Naive matrix multiply kernel: d_c = d_a * d_b for square, row-major n x n
// int matrices.
// Each thread computes one output element; the row comes from the
// y-dimension of the launch and the column from the x-dimension. Threads
// outside the n x n range do nothing.
__global__ void matMul(int *d_a, int *d_b, int *d_c, int n) {
    int rowi = blockDim.y * blockIdx.y + threadIdx.y;
    int coli = blockDim.x * blockIdx.x + threadIdx.x;

    if (rowi < n && coli < n) {
        // Accumulate in a local that starts at 0: the output buffer is not
        // guaranteed to be zeroed, so summing directly into it would add the
        // dot product to whatever the memory held before.
        int acc = 0;
        for (int i = 0; i < n; i++) {
            acc += d_a[rowi * n + i] * d_b[i * n + coli];
        }
        d_c[rowi * n + coli] = acc;
    }
}

// Host driver for the naive matrix multiplication demo.
// Builds two random 1024 x 1024 int matrices, multiplies them on the GPU with
// a 2D launch of 16 x 16 thread blocks, and checks every output element
// against a host-side triple loop.
int main() {
    const int n = 1 << 10;
    const size_t bytes = sizeof(int) * n * n;

    // Host matrices (row-major).
    int *h_a = (int*) malloc(bytes);
    int *h_b = (int*) malloc(bytes);
    int *h_c = (int*) malloc(bytes);

    // Device matrices.
    int *d_a;
    int *d_b;
    int *d_c;
    cudaMalloc(&d_a, bytes);
    cudaMalloc(&d_b, bytes);
    cudaMalloc(&d_c, bytes);

    // Random inputs in [0, 99]; a and b are drawn alternately.
    const int numElements = n * n;
    for (int idx = 0; idx < numElements; ++idx) {
        h_a[idx] = rand() % 100;
        h_b[idx] = rand() % 100;
    }

    cudaMemcpy(d_a, h_a, bytes, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, h_b, bytes, cudaMemcpyHostToDevice);

    // Square blocks and a square grid, rounded up to cover all n rows/cols.
    const int numThreads = 16;
    const int numBlocks = (n + numThreads - 1) / numThreads;
    dim3 sizeBlock(numThreads, numThreads);
    dim3 sizeGrid(numBlocks, numBlocks);
    matMul<<<sizeGrid, sizeBlock>>>(d_a, d_b, d_c, n);

    cudaMemcpy(h_c, d_c, bytes, cudaMemcpyDeviceToHost);
    printf("CUDA Completed!\n");

    // Recompute each element on the host and compare with the GPU value.
    for (int row = 0; row < n; ++row) {
        for (int col = 0; col < n; ++col) {
            const int gpuValue = h_c[row * n + col];
            int cpuValue = 0;
            for (int k = 0; k < n; ++k) {
                cpuValue += h_a[row * n + k] * h_b[k * n + col];
            }
            assert(cpuValue == gpuValue);
        }
    }

    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);

    free(h_a);
    free(h_b);
    free(h_c);

    printf("Success!");
    return 0;
}

CUDA Completed!
Success!


# Chapter 5: Tiled Cache Matrix Multiplication (C Implementation)

In [None]:
%%cuda

#include <assert.h>
#include <stdio.h>

// Edge length of the shared-memory tile. The kernel below expects to be
// launched with SIZE_BLOCK_EDGE x SIZE_BLOCK_EDGE thread blocks.
const int SIZE_BLOCK_EDGE = 1 << 4;

// Tiled matrix multiply kernel: c = a * b for square, row-major
// sizeEdge x sizeEdge int matrices.
//
// Each block walks along the shared dimension in steps of SIZE_BLOCK_EDGE.
// On every step its threads cooperatively stage one tile of `a` and one tile
// of `b` in shared memory, then each thread accumulates the partial dot
// product for its own output element from those tiles.
//
// Preconditions: blockDim.x == blockDim.y == SIZE_BLOCK_EDGE (the shared
// arrays are sized for exactly that many threads).
// sizeEdge does not have to be a multiple of SIZE_BLOCK_EDGE: out-of-range
// tile entries are loaded as 0 and out-of-range threads skip the final store.
__global__ void matmul(int *a, int *b, int *c, int sizeEdge) {
    __shared__ int s_a[SIZE_BLOCK_EDGE * SIZE_BLOCK_EDGE];
    __shared__ int s_b[SIZE_BLOCK_EDGE * SIZE_BLOCK_EDGE];

    int rowi = blockDim.y * blockIdx.y + threadIdx.y;
    int coli = blockDim.x * blockIdx.x + threadIdx.x;

    int tmp = 0;

    for (int wini = 0; wini < sizeEdge; wini += SIZE_BLOCK_EDGE) {
        // Column of `a` / row of `b` this thread stages for the current tile.
        int aCol = wini + threadIdx.x;
        int bRow = wini + threadIdx.y;

        // Pad with zeros past the matrix edge; a zero contributes nothing to
        // the dot product, and every thread still reaches the barriers.
        s_a[threadIdx.y * blockDim.x + threadIdx.x] =
            (rowi < sizeEdge && aCol < sizeEdge) ? a[rowi * sizeEdge + aCol] : 0;
        s_b[threadIdx.y * blockDim.x + threadIdx.x] =
            (bRow < sizeEdge && coli < sizeEdge) ? b[bRow * sizeEdge + coli] : 0;

        // Both tiles must be fully written before any thread reads them.
        __syncthreads();

        for (int keri = 0; keri < SIZE_BLOCK_EDGE; keri++) {
            tmp += s_a[threadIdx.y * blockDim.x + keri] * s_b[keri * blockDim.x + threadIdx.x];
        }

        // Do not overwrite the tiles until every thread has finished reading.
        __syncthreads();
    }

    if (rowi < sizeEdge && coli < sizeEdge) {
        c[rowi * sizeEdge + coli] = tmp;
    }
}

// Host reference check for a square matrix product.
// Recomputes every element of a * b (row-major, sizeEdge x sizeEdge) with a
// plain triple loop and asserts that it equals the matching entry of c.
void verifyResult(int *a, int *b, int *c, int sizeEdge) {
    for (int row = 0; row < sizeEdge; ++row) {
        const int *aRow = a + row * sizeEdge;
        const int *cRow = c + row * sizeEdge;

        for (int col = 0; col < sizeEdge; ++col) {
            int dot = 0;
            for (int k = 0; k < sizeEdge; ++k) {
                dot += aRow[k] * b[k * sizeEdge + col];
            }
            assert(dot == cRow[col]);
        }
    }
}

// Host driver for the tiled matrix multiplication demo (C version).
// Multiplies two random 1024 x 1024 int matrices on the GPU using
// SIZE_BLOCK_EDGE x SIZE_BLOCK_EDGE thread blocks, prints the launch
// geometry, and verifies the result on the host.
int main() {
    const int sizeEdge = 1 << 10;
    const int sizeMatrix = sizeEdge * sizeEdge;
    const size_t numBytes = sizeMatrix * sizeof(int);

    // Host matrices (row-major).
    int *h_a = (int*) malloc(numBytes);
    int *h_b = (int*) malloc(numBytes);
    int *h_c = (int*) malloc(numBytes);

    // Device matrices.
    int *d_a;
    int *d_b;
    int *d_c;
    cudaMalloc(&d_a, numBytes);
    cudaMalloc(&d_b, numBytes);
    cudaMalloc(&d_c, numBytes);

    // Random inputs in [0, 99]; a and b are drawn alternately.
    int idx = 0;
    while (idx < sizeMatrix) {
        h_a[idx] = rand() % 100;
        h_b[idx] = rand() % 100;
        ++idx;
    }

    cudaMemcpy(d_a, h_a, numBytes, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, h_b, numBytes, cudaMemcpyHostToDevice);

    // Square blocks matching the tile size; grid edge rounded up.
    const int sizeBlockEdge = SIZE_BLOCK_EDGE;
    const int sizeGridEdge = (sizeEdge + sizeBlockEdge - 1) / sizeBlockEdge;

    printf("sizeEdge = %d\n", sizeEdge);
    printf("sizeBlockEdge = %d\n", sizeBlockEdge);
    printf("sizeGridEdge = %d\n", sizeGridEdge);

    dim3 dimBlock(sizeBlockEdge, sizeBlockEdge);
    dim3 dimGrid(sizeGridEdge, sizeGridEdge);
    matmul<<<dimGrid, dimBlock>>>(d_a, d_b, d_c, sizeEdge);

    // Download the product and compare it with a host-side computation.
    cudaMemcpy(h_c, d_c, numBytes, cudaMemcpyDeviceToHost);
    verifyResult(h_a, h_b, h_c, sizeEdge);

    free(h_a);
    free(h_b);
    free(h_c);

    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);

    printf("Success!");
    return 0;
}

sizeEdge = 1024
sizeBlockEdge = 16
sizeGridEdge = 64
Success!


## Practice

In [None]:
%%cuda

#include <assert.h>
#include <stdio.h>

// Problem dimensions for the rectangular product C = A * B (row-major):
// A is NUM_ROWS x NUM_CHANS, B is NUM_CHANS x NUM_COLS, C is NUM_ROWS x NUM_COLS.
const int NUM_ROWS = 256;     // 2^8
const int NUM_COLS = 4096;    // 2^12
const int NUM_CHANS = 65536;  // 2^16

// Edge length of the square thread block and of the shared-memory tiles.
const int SIZE_EDGE_BLOCK = 16;  // 2^4

// Tiled matrix multiply kernel for the fixed-size rectangular problem:
// c (NUM_ROWS x NUM_COLS) = a (NUM_ROWS x NUM_CHANS) * b (NUM_CHANS x NUM_COLS),
// all row-major.
//
// The output row comes from the y-dimension of the launch and the output
// column from the x-dimension. The block steps along the shared dimension in
// chunks of SIZE_EDGE_BLOCK, staging one tile of each input in shared memory
// per step. There are no bounds checks, so the launch must cover the output
// exactly with SIZE_EDGE_BLOCK x SIZE_EDGE_BLOCK blocks.
__global__ void matmul(int *a, int *b, int *c) {
    __shared__ int s_a[SIZE_EDGE_BLOCK * SIZE_EDGE_BLOCK];
    __shared__ int s_b[SIZE_EDGE_BLOCK * SIZE_EDGE_BLOCK];

    // Position of this thread inside its block and inside the output matrix.
    const int localRow = threadIdx.y;
    const int localCol = threadIdx.x;
    const int tileWidth = blockDim.x;
    const int rowi = blockIdx.y * blockDim.y + localRow;
    const int coli = blockIdx.x * blockDim.x + localCol;

    // Shared-memory slot this thread fills on every step.
    const int slot = localRow * tileWidth + localCol;

    int acc = 0;
    for (int tileStart = 0; tileStart < NUM_CHANS; tileStart += SIZE_EDGE_BLOCK) {
        s_a[slot] = a[rowi * NUM_CHANS + tileStart + localCol];
        s_b[slot] = b[(tileStart + localRow) * NUM_COLS + coli];

        // Tiles must be complete before anyone reads them.
        __syncthreads();

        for (int k = 0; k < SIZE_EDGE_BLOCK; ++k) {
            acc += s_a[localRow * tileWidth + k] * s_b[k * tileWidth + localCol];
        }

        // Everyone must be done reading before the tiles are overwritten.
        __syncthreads();
    }

    c[rowi * NUM_COLS + coli] = acc;
}

// Fills the first sizeMatrix entries of m with pseudo-random values in
// [0, 99], one rand() call per entry in ascending order.
void matrixInit(int *m, int sizeMatrix) {
    int idx = 0;
    while (idx < sizeMatrix) {
        m[idx] = rand() % 100;
        ++idx;
    }
}

// Host reference check for the rectangular product.
// Recomputes every element of a * b with a triple loop over NUM_ROWS,
// NUM_COLS and NUM_CHANS and prints the coordinates of each element of c
// that differs. Mismatches are only reported; nothing is returned or aborted.
void matrixVerify(int *a, int *b, int *c) {
    for (int row = 0; row < NUM_ROWS; ++row) {
        const int *aRow = a + row * NUM_CHANS;

        for (int col = 0; col < NUM_COLS; ++col) {
            int reference = 0;
            for (int k = 0; k < NUM_CHANS; ++k) {
                reference += aRow[k] * b[k * NUM_COLS + col];
            }

            const bool matches = (reference == c[row * NUM_COLS + col]);
            if (!matches) {
                printf("rowi = %d, coli = %d", row, col);
            }
        }
    }
}

// Host driver for the rectangular tiled matrix multiplication practice.
// Builds random A (NUM_ROWS x NUM_CHANS) and B (NUM_CHANS x NUM_COLS), runs
// the tiled kernel to produce C (NUM_ROWS x NUM_COLS), copies C back and
// runs the host-side verification.
int main() {
    // Element counts of the three matrices.
    int sizeMatrixA = NUM_ROWS * NUM_CHANS;
    int sizeMatrixB = NUM_CHANS * NUM_COLS;
    int sizeMatrixC = NUM_ROWS * NUM_COLS;

    size_t numBytesA = sizeof(int) * sizeMatrixA;
    size_t numBytesB = sizeof(int) * sizeMatrixB;
    size_t numBytesC = sizeof(int) * sizeMatrixC;

    int *h_a = (int*) malloc(numBytesA);
    int *h_b = (int*) malloc(numBytesB);
    int *h_c = (int*) malloc(numBytesC);

    matrixInit(h_a, sizeMatrixA);
    matrixInit(h_b, sizeMatrixB);

    int *d_a, *d_b, *d_c;
    cudaMalloc(&d_a, numBytesA);
    cudaMalloc(&d_b, numBytesB);
    cudaMalloc(&d_c, numBytesC);

    cudaMemcpy(d_a, h_a, numBytesA, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, h_b, numBytesB, cudaMemcpyHostToDevice);

    // Number of blocks needed along the rows and the columns of C.
    int numRowsGrid = NUM_ROWS / SIZE_EDGE_BLOCK;
    int numColsGrid = NUM_COLS / SIZE_EDGE_BLOCK;

    printf("numRowsGrid = %d\n", numRowsGrid);
    printf("numColsGrid = %d\n", numColsGrid);

    // dim3 takes (x, y). The kernel derives the column from blockIdx.x and
    // the row from blockIdx.y, so x must span the columns and y the rows.
    dim3 dimGrid(numColsGrid, numRowsGrid);
    dim3 dimBlock(SIZE_EDGE_BLOCK, SIZE_EDGE_BLOCK);

    matmul<<<dimGrid, dimBlock>>>(d_a, d_b, d_c);

    cudaMemcpy(h_c, d_c, numBytesC, cudaMemcpyDeviceToHost);

    matrixVerify(h_a, h_b, h_c);

    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);

    free(h_a);
    free(h_b);
    free(h_c);

    printf("Success!");

    return 0;
}

# Chapter 5: Tiled Cache Matrix Multiplication (C++ Implementation)

In [None]:
%%cuda

#include <algorithm>
#include <cassert>
#include <iostream>
#include <vector>

// Tiled matrix multiply kernel: c = a * b for square, row-major
// sizeEdge x sizeEdge int matrices.
//
// Each block walks along the shared dimension one TILE at a time. On every
// step its threads stage one tile of `a` and one tile of `b` in shared
// memory, then each thread accumulates the partial dot product for its own
// output element.
//
// Preconditions: must be launched with TILE x TILE (16 x 16) thread blocks;
// this is asserted on the device. sizeEdge may be any positive size:
// out-of-range tile entries are loaded as 0 and out-of-range threads skip
// the final store.
__global__ void matmul(int *a, int *b, int *c, int sizeEdge) {
    constexpr int TILE = 1 << 4;
    assert(blockDim.x == TILE && blockDim.y == TILE);

    __shared__ int s_a[TILE][TILE];
    __shared__ int s_b[TILE][TILE];

    const int rowi = blockIdx.y * TILE + threadIdx.y;
    const int coli = blockIdx.x * TILE + threadIdx.x;

    int tmp = 0;

    for (int wini = 0; wini < sizeEdge; wini += TILE) {
        // Column of `a` / row of `b` this thread stages for the current tile.
        const int aCol = wini + threadIdx.x;
        const int bRow = wini + threadIdx.y;

        // Zero padding keeps the dot product correct at the matrix edge and
        // lets every thread reach both barriers.
        s_a[threadIdx.y][threadIdx.x] =
            (rowi < sizeEdge && aCol < sizeEdge) ? a[rowi * sizeEdge + aCol] : 0;
        s_b[threadIdx.y][threadIdx.x] =
            (bRow < sizeEdge && coli < sizeEdge) ? b[bRow * sizeEdge + coli] : 0;

        // Tiles must be complete before any thread reads them.
        __syncthreads();

        for (int keri = 0; keri < TILE; keri++) {
            tmp += s_a[threadIdx.y][keri] * s_b[keri][threadIdx.x];
        }

        // All reads must finish before the next step overwrites the tiles.
        __syncthreads();
    }

    if (rowi < sizeEdge && coli < sizeEdge) {
        c[rowi * sizeEdge + coli] = tmp;
    }
}

// Host reference check for a square matrix product.
//   a, b     : row-major sizeEdge x sizeEdge input matrices
//   c        : row-major sizeEdge x sizeEdge matrix to verify
//   sizeEdge : number of rows (and columns) of each matrix
// Recomputes every element of a * b with a triple loop and asserts that it
// equals the matching element of c.
void arrayCheck(
    const std::vector<int>& a, const std::vector<int>& b,
    const std::vector<int>& c, const int sizeEdge
) {
    for (int rowi = 0; rowi < sizeEdge; rowi++) {
        for (int coli = 0; coli < sizeEdge; coli++) {
            int expected = 0;

            for (int keri = 0; keri < sizeEdge; keri++) {
                expected += a[rowi * sizeEdge + keri] * b[keri * sizeEdge + coli];
            }

            assert(expected == c[rowi * sizeEdge + coli]);
        }
    }
}

// Host driver for the tiled matrix multiplication demo (C++ version).
// Builds two random 1024 x 1024 int matrices in std::vector storage,
// multiplies them on the GPU with 16 x 16 thread blocks, copies the product
// back and hands all three matrices to arrayCheck for verification.
int main() {
    // Step 1: Setup parameters of the matrix multiplication.
    int sizeEdge = 1 << 10;
    int sizeMatrix = sizeEdge * sizeEdge;
    size_t sizeBytes = sizeof(int) * sizeMatrix;

    // Step 2: Init host matrix.
    std::vector<int> h_a(sizeMatrix);
    std::vector<int> h_b(sizeMatrix);
    std::vector<int> h_c(sizeMatrix);

    // Both iterators of a range must come from the same container.
    std::generate(h_a.begin(), h_a.end(), [](){return rand() % 100;});
    std::generate(h_b.begin(), h_b.end(), [](){return rand() % 100;});

    // Step 3: Init device matrix.
    int *d_a, *d_b, *d_c;

    cudaMalloc(&d_a, sizeBytes);
    cudaMalloc(&d_b, sizeBytes);
    cudaMalloc(&d_c, sizeBytes);

    // Step 4: Launch the kernel function.
    cudaMemcpy(d_a, h_a.data(), sizeBytes, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, h_b.data(), sizeBytes, cudaMemcpyHostToDevice);

    int sizeBlockEdge = 1 << 4;
    int sizeGridEdge = (int) ceil(1.0 * sizeEdge / sizeBlockEdge);

    dim3 dimBlock(sizeBlockEdge, sizeBlockEdge);
    dim3 dimGrid(sizeGridEdge, sizeGridEdge);
    matmul<<<dimGrid, dimBlock>>>(d_a, d_b, d_c, sizeEdge);

    // Step 5: Copy the result back and verify it on the host.
    cudaMemcpy(h_c.data(), d_c, sizeBytes, cudaMemcpyDeviceToHost);

    arrayCheck(h_a, h_b, h_c, sizeEdge);

    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);

    std::cout << "Success!" << std::endl;

    return 0;
}


