<a href="https://colab.research.google.com/github/TechDailyNotes/study-notes-cuda/blob/main/cuda.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!nvcc --version
!pip install nvcc4jupyter
%load_ext nvcc4jupyter

In [13]:
%%cuda
#include <stdio.h>

// Debug kernel: each launched thread prints the x-index of its block and
// its own x-index within that block using device-side printf.
__global__ void hello() {
    const unsigned int block = blockIdx.x;
    const unsigned int thread = threadIdx.x;
    printf("Hello from block %u, thread %u\n", block, thread);
}

// Launches the hello kernel on 2 blocks of 2 threads and waits for it to
// finish so the device-side printf output is flushed before the process exits.
// Returns 0 on success, 1 if the launch or the kernel execution failed.
int main() {
    hello<<<2, 2>>>();

    // A kernel launch returns no status itself: configuration errors show up
    // in cudaGetLastError(), execution errors at the next synchronizing call.
    cudaError_t status = cudaGetLastError();
    if (status == cudaSuccess) {
        status = cudaDeviceSynchronize();
    }
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(status));
        return 1;
    }
    return 0;
}

Hello from block 0, thread 0
Hello from block 0, thread 1
Hello from block 1, thread 0
Hello from block 1, thread 1



# Chapter 1: Vector Addition

In [44]:
%%cuda

#include <assert.h>
#include <stdio.h>

// Element-wise addition: d_c[i] = d_a[i] + d_b[i] for every i in [0, n).
// Each thread handles the single element selected by its global x-index;
// threads whose index falls at or beyond n do nothing.
__global__ void vectorAdd(int *d_a, int *d_b, int *d_c, int n) {
    const int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx >= n) {
        return;
    }
    d_c[idx] = d_a[idx] + d_b[idx];
}

// Fills h_a[0..n) with values rand() % 100, in increasing index order.
// Does nothing when n <= 0.
void numInit(int *h_a, int n) {
    int *cursor = h_a;
    for (int remaining = n; remaining > 0; --remaining) {
        *cursor = rand() % 100;
        ++cursor;
    }
}

// Verifies on the host that h_c holds the element-wise sum of h_a and h_b
// over the first n elements; the first mismatch trips an assert.
void numCheck(int *h_a, int *h_b, int *h_c, int n) {
    int i = 0;
    while (i < n) {
        assert(h_c[i] == h_a[i] + h_b[i]);
        ++i;
    }
}

// Terminates the program with a readable message when a CUDA runtime call
// did not return cudaSuccess. `what` names the failed step.
static void checkCuda(cudaError_t status, const char *what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Vector addition with explicit host/device buffers:
// allocate, fill inputs on the host, copy to the device, launch vectorAdd,
// copy the result back and verify it on the host.
int main() {
    const int n = 1 << 16;
    const size_t bytes = sizeof(int) * n;

    int *h_a = (int*) malloc(bytes);
    int *h_b = (int*) malloc(bytes);
    int *h_c = (int*) malloc(bytes);
    if (h_a == NULL || h_b == NULL || h_c == NULL) {
        fprintf(stderr, "Host allocation failed\n");
        return EXIT_FAILURE;
    }

    int *d_a = NULL, *d_b = NULL, *d_c = NULL;
    checkCuda(cudaMalloc(&d_a, bytes), "cudaMalloc(d_a)");
    checkCuda(cudaMalloc(&d_b, bytes), "cudaMalloc(d_b)");
    checkCuda(cudaMalloc(&d_c, bytes), "cudaMalloc(d_c)");

    numInit(h_a, n);
    numInit(h_b, n);

    checkCuda(cudaMemcpy(d_a, h_a, bytes, cudaMemcpyHostToDevice), "copy of a to device");
    checkCuda(cudaMemcpy(d_b, h_b, bytes, cudaMemcpyHostToDevice), "copy of b to device");

    const int numThreads = 256;
    // Integer ceil-division: enough blocks to cover n even when n is not a
    // multiple of numThreads (the kernel's bounds guard handles the tail).
    const int numBlocks = (n + numThreads - 1) / numThreads;

    vectorAdd<<<numBlocks, numThreads>>>(d_a, d_b, d_c, n);
    // Launches report configuration errors only through cudaGetLastError().
    checkCuda(cudaGetLastError(), "vectorAdd launch");

    // The blocking copy also waits for the kernel, so execution errors
    // surface here.
    checkCuda(cudaMemcpy(h_c, d_c, bytes, cudaMemcpyDeviceToHost), "copy of c to host");

    numCheck(h_a, h_b, h_c, n);

    checkCuda(cudaFree(d_a), "cudaFree(d_a)");
    checkCuda(cudaFree(d_b), "cudaFree(d_b)");
    checkCuda(cudaFree(d_c), "cudaFree(d_c)");

    free(h_a);
    free(h_b);
    free(h_c);

    printf("Success!");

    return 0;
}

Success!


# Chapter 2: Unified Memory Vector Add

In [49]:
%%cuda

#include <assert.h>
#include <stdio.h>

// Adds a and b element by element into c for indices [0, n).
// The global x-index of the thread picks the element; indices past the end
// of the arrays are skipped.
__global__ void vectorAdd(int *a, int *b, int *c, int n) {
    const int offsetOfBlock = blockDim.x * blockIdx.x;
    const int pos = offsetOfBlock + threadIdx.x;
    if (pos < n) {
        const int lhs = a[pos];
        const int rhs = b[pos];
        c[pos] = lhs + rhs;
    }
}

// Initializes the three vectors used by the addition test.
//   a, b : filled with values rand() % 100 (the inputs)
//   c    : zeroed (the output, so it never holds stale data)
//   n    : number of elements in each array
// The previous version only wrote c[i] = a[i] + b[i], which read a and b
// before anything had been stored in them.
void numInit(int *a, int *b, int *c, int n) {
    for (int i = 0; i < n; i++) {
        a[i] = rand() % 100;
        b[i] = rand() % 100;
        c[i] = 0;
    }
}

// Host-side verification: asserts that c[i] equals a[i] + b[i] for each of
// the first n elements, checking them in increasing index order.
void numCheck(int *a, int *b, int *c, int n) {
    int i = 0;
    while (i < n) {
        assert(c[i] == a[i] + b[i]);
        ++i;
    }
}

// Terminates the program with a readable message when a CUDA runtime call
// did not return cudaSuccess. `what` names the failed step.
static void checkCuda(cudaError_t status, const char *what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Requests migration of a managed buffer to `device`. Prefetching is only a
// performance hint, so a failure is cleared instead of aborting; otherwise a
// later cudaGetLastError() would report it as if the kernel launch had failed.
static void prefetchBestEffort(int *ptr, size_t bytes, int device) {
    if (cudaMemPrefetchAsync(ptr, bytes, device) != cudaSuccess) {
        cudaGetLastError();
    }
}

// Vector addition on unified (managed) memory: the same pointers are used by
// host and device, with prefetches before the kernel and before verification.
int main() {
    // cudaGetDevice() returns a status code and writes the device id through
    // its argument; the two must not share a variable.
    int id = 0;
    checkCuda(cudaGetDevice(&id), "cudaGetDevice");

    const int n = 1 << 16;
    const size_t bytes = n * sizeof(int);

    int *a = NULL, *b = NULL, *c = NULL;

    checkCuda(cudaMallocManaged(&a, bytes), "cudaMallocManaged(a)");
    checkCuda(cudaMallocManaged(&b, bytes), "cudaMallocManaged(b)");
    checkCuda(cudaMallocManaged(&c, bytes), "cudaMallocManaged(c)");

    numInit(a, b, c, n);

    const int numThreads = 512;
    // Integer ceil-division so a partial last block is still launched.
    const int numBlocks = (n + numThreads - 1) / numThreads;

    prefetchBestEffort(a, bytes, id);
    prefetchBestEffort(b, bytes, id);
    vectorAdd<<<numBlocks, numThreads>>>(a, b, c, n);
    checkCuda(cudaGetLastError(), "vectorAdd launch");
    checkCuda(cudaDeviceSynchronize(), "vectorAdd execution");

    prefetchBestEffort(c, bytes, cudaCpuDeviceId);
    // The prefetch is asynchronous; wait for it before the host reads c.
    checkCuda(cudaDeviceSynchronize(), "prefetch of c to host");

    numCheck(a, b, c, n);

    checkCuda(cudaFree(a), "cudaFree(a)");
    checkCuda(cudaFree(b), "cudaFree(b)");
    checkCuda(cudaFree(c), "cudaFree(c)");

    printf("Success!");

    return 0;
}

Success!


# Chapter 3: Pinned Memory Vector Add

In [54]:
%%cuda

#include <assert.h>
#include <stdio.h>

// Writes d_a[i] + d_b[i] into d_c[i] for the element i addressed by this
// thread's global x-index, provided that index is below n.
__global__ void vectorAdd(int *d_a, int *d_b, int *d_c, int n) {
    const int gid = blockIdx.x * blockDim.x + threadIdx.x;
    const bool inRange = gid < n;
    if (inRange) {
        int sum = d_a[gid];
        sum += d_b[gid];
        d_c[gid] = sum;
    }
}

// Terminates the program with a readable message when a CUDA runtime call
// did not return cudaSuccess. `what` names the failed step.
static void checkCuda(cudaError_t status, const char *what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Vector addition where the host buffers come from cudaMallocHost
// (page-locked memory) instead of malloc; otherwise the flow is
// allocate -> fill -> copy in -> launch -> copy out -> verify -> free.
int main() {
    const int n = 1 << 16;
    const size_t bytes = sizeof(int) * n;

    int *h_a = NULL, *h_b = NULL, *h_c = NULL;
    int *d_a = NULL, *d_b = NULL, *d_c = NULL;

    checkCuda(cudaMallocHost(&h_a, bytes), "cudaMallocHost(h_a)");
    checkCuda(cudaMallocHost(&h_b, bytes), "cudaMallocHost(h_b)");
    checkCuda(cudaMallocHost(&h_c, bytes), "cudaMallocHost(h_c)");

    checkCuda(cudaMalloc(&d_a, bytes), "cudaMalloc(d_a)");
    checkCuda(cudaMalloc(&d_b, bytes), "cudaMalloc(d_b)");
    checkCuda(cudaMalloc(&d_c, bytes), "cudaMalloc(d_c)");

    for (int i = 0; i < n; i++) {
        h_a[i] = rand() % 100;
        h_b[i] = rand() % 100;
    }

    const int numThreads = 256;
    // Integer ceil-division so a partial last block is still launched.
    const int numBlocks = (n + numThreads - 1) / numThreads;

    checkCuda(cudaMemcpy(d_a, h_a, bytes, cudaMemcpyHostToDevice), "copy of a to device");
    checkCuda(cudaMemcpy(d_b, h_b, bytes, cudaMemcpyHostToDevice), "copy of b to device");
    vectorAdd<<<numBlocks, numThreads>>>(d_a, d_b, d_c, n);
    // Launches report configuration errors only through cudaGetLastError().
    checkCuda(cudaGetLastError(), "vectorAdd launch");
    // The blocking copy also waits for the kernel to finish.
    checkCuda(cudaMemcpy(h_c, d_c, bytes, cudaMemcpyDeviceToHost), "copy of c to host");

    for (int i = 0; i < n; i++) {
        assert(h_c[i] == h_a[i] + h_b[i]);
    }

    checkCuda(cudaFree(d_a), "cudaFree(d_a)");
    checkCuda(cudaFree(d_b), "cudaFree(d_b)");
    checkCuda(cudaFree(d_c), "cudaFree(d_c)");

    checkCuda(cudaFreeHost(h_a), "cudaFreeHost(h_a)");
    checkCuda(cudaFreeHost(h_b), "cudaFreeHost(h_b)");
    checkCuda(cudaFreeHost(h_c), "cudaFreeHost(h_c)");

    printf("Success!");

    return 0;
}

Success!


# Chapter 4: Matrix Multiplication

In [70]:
%%cuda

#include <assert.h>
#include <stdio.h>

// Naive square matrix multiply C = A * B on row-major n x n int matrices.
// Launch layout: 2D grid of 2D blocks; the thread's global y-index selects
// the output row and its global x-index the output column. Threads that
// fall outside the n x n output do nothing.
__global__ void matMul(int *d_a, int *d_b, int *d_c, int n) {
    int rowi = blockDim.y * blockIdx.y + threadIdx.y;
    int coli = blockDim.x * blockIdx.x + threadIdx.x;

    if (rowi < n && coli < n) {
        // Accumulate in a local variable and store once: the output buffer
        // is not required to be zeroed beforehand, and the inner loop no
        // longer does a global read-modify-write per iteration.
        int acc = 0;
        for (int i = 0; i < n; i++) {
            acc += d_a[rowi * n + i] * d_b[i * n + coli];
        }
        d_c[rowi * n + coli] = acc;
    }
}

// Terminates the program with a readable message when a CUDA runtime call
// did not return cudaSuccess. `what` names the failed step.
static void checkCuda(cudaError_t status, const char *what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Multiplies two random n x n int matrices on the GPU with matMul and checks
// every output element against a host-side reference computation.
int main() {
    const int n = 1 << 10;
    const size_t bytes = sizeof(int) * n * n;

    int *h_a = (int*) malloc(bytes);
    int *h_b = (int*) malloc(bytes);
    int *h_c = (int*) malloc(bytes);
    if (h_a == NULL || h_b == NULL || h_c == NULL) {
        fprintf(stderr, "Host allocation failed\n");
        return EXIT_FAILURE;
    }

    int *d_a = NULL, *d_b = NULL, *d_c = NULL;
    checkCuda(cudaMalloc(&d_a, bytes), "cudaMalloc(d_a)");
    checkCuda(cudaMalloc(&d_b, bytes), "cudaMalloc(d_b)");
    checkCuda(cudaMalloc(&d_c, bytes), "cudaMalloc(d_c)");

    for (int i = 0; i < n * n; i++) {
        h_a[i] = rand() % 100;
        h_b[i] = rand() % 100;
    }

    checkCuda(cudaMemcpy(d_a, h_a, bytes, cudaMemcpyHostToDevice), "copy of a to device");
    checkCuda(cudaMemcpy(d_b, h_b, bytes, cudaMemcpyHostToDevice), "copy of b to device");
    // cudaMalloc does not clear memory. Zero the output so the result is
    // well defined even if the kernel accumulates into d_c.
    checkCuda(cudaMemset(d_c, 0, bytes), "cudaMemset(d_c)");

    const int numThreads = 16;
    // Integer ceil-division so partial edge tiles are still launched.
    const int numBlocks = (n + numThreads - 1) / numThreads;

    dim3 sizeBlock(numThreads, numThreads);
    dim3 sizeGrid(numBlocks, numBlocks);
    matMul<<<sizeGrid, sizeBlock>>>(d_a, d_b, d_c, n);
    // Launches report configuration errors only through cudaGetLastError().
    checkCuda(cudaGetLastError(), "matMul launch");

    // The blocking copy also waits for the kernel to finish.
    checkCuda(cudaMemcpy(h_c, d_c, bytes, cudaMemcpyDeviceToHost), "copy of c to host");
    printf("CUDA Completed!\n");

    for (int i = 0; i < n; i++) {
        for (int j = 0; j < n; j++) {
            // Host reference for element (i, j), compared with the GPU value.
            int cExpected = 0;
            for (int k = 0; k < n; k++) {
                cExpected += h_a[i * n + k] * h_b[k * n + j];
            }
            int cActual = h_c[i * n + j];
            assert(cActual == cExpected);
        }
    }

    checkCuda(cudaFree(d_a), "cudaFree(d_a)");
    checkCuda(cudaFree(d_b), "cudaFree(d_b)");
    checkCuda(cudaFree(d_c), "cudaFree(d_c)");

    free(h_a);
    free(h_b);
    free(h_c);

    printf("Success!");

    return 0;
}

CUDA Completed!
Success!


# Chapter 5: Cache-Tiled Matrix Multiplication

In [None]:
%%cuda

// Shared-memory tiled square matrix multiply C = A * B on row-major n x n
// int matrices.
//
// Launch requirements:
//   - 2D blocks of numThreads x numThreads threads (blockDim.x == blockDim.y
//     == numThreads), with 1 <= numThreads <= 32; if this does not hold the
//     kernel returns without writing c.
//   - 2D grid covering the n x n output; n need not be a multiple of
//     numThreads (out-of-range tile entries are padded with 0).
//   - Uses 2 * 32 * 32 ints of static shared memory, so no dynamic
//     shared-memory size has to be passed at launch.
__global__ void matmul(int *a, int *b, int *c, int n, int numThreads) {
    // Shared array sizes must be compile-time constants; a square block has
    // at most 32 x 32 threads, so that bounds the tile.
    constexpr int kMaxTile = 32;
    __shared__ int A[kMaxTile][kMaxTile];
    __shared__ int B[kMaxTile][kMaxTile];

    // This condition is identical for every thread of the block, so the early
    // return cannot leave part of a block waiting at __syncthreads().
    if (numThreads <= 0 || numThreads > kMaxTile ||
        blockDim.x != (unsigned int) numThreads ||
        blockDim.y != (unsigned int) numThreads) {
        return;
    }

    int rowi = blockDim.y * blockIdx.y + threadIdx.y;
    int coli = blockDim.x * blockIdx.x + threadIdx.x;

    int tmp = 0;

    // Walk the tiles along the shared dimension (ceil-division handles a
    // partial last tile).
    int numTiles = (n + numThreads - 1) / numThreads;
    for (int k = 0; k < numTiles; k++) {
        int aCol = k * numThreads + threadIdx.x;
        int bRow = k * numThreads + threadIdx.y;

        // Each thread loads one element of the A tile and one of the B tile.
        A[threadIdx.y][threadIdx.x] = (rowi < n && aCol < n) ? a[rowi * n + aCol] : 0;
        B[threadIdx.y][threadIdx.x] = (bRow < n && coli < n) ? b[bRow * n + coli] : 0;

        // Both tiles must be fully written before any thread reads them.
        __syncthreads();

        for (int j = 0; j < numThreads; j++) {
            tmp += A[threadIdx.y][j] * B[j][threadIdx.x];
        }

        // All reads must finish before the next iteration overwrites the tiles.
        __syncthreads();
    }

    if (rowi < n && coli < n) {
        c[rowi * n + coli] = tmp;
    }
}

// Terminates the program with a readable message when a CUDA runtime call
// did not return cudaSuccess. `what` names the failed step.
static void checkCuda(cudaError_t status, const char *what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Multiplies two random n x n int matrices with the tiled matmul kernel,
// using page-locked host buffers, and checks every output element against a
// host-side reference. Returns 0 on success, non-zero on any failure.
int main() {
    const int n = 1 << 10;
    const size_t bytes = sizeof(int) * n * n;

    int *h_a = NULL, *h_b = NULL, *h_c = NULL;
    checkCuda(cudaMallocHost(&h_a, bytes), "cudaMallocHost(h_a)");
    checkCuda(cudaMallocHost(&h_b, bytes), "cudaMallocHost(h_b)");
    checkCuda(cudaMallocHost(&h_c, bytes), "cudaMallocHost(h_c)");

    int *d_a = NULL, *d_b = NULL, *d_c = NULL;
    checkCuda(cudaMalloc(&d_a, bytes), "cudaMalloc(d_a)");
    checkCuda(cudaMalloc(&d_b, bytes), "cudaMalloc(d_b)");
    checkCuda(cudaMalloc(&d_c, bytes), "cudaMalloc(d_c)");

    for (int i = 0; i < n * n; i++) {
        h_a[i] = rand() % 100;
        h_b[i] = rand() % 100;
    }

    checkCuda(cudaMemcpy(d_a, h_a, bytes, cudaMemcpyHostToDevice), "copy of a to device");
    checkCuda(cudaMemcpy(d_b, h_b, bytes, cudaMemcpyHostToDevice), "copy of b to device");

    const int numThreads = 16;
    // Integer ceil-division so partial edge tiles are still launched.
    const int numBlocks = (n + numThreads - 1) / numThreads;

    dim3 sizeGrid(numBlocks, numBlocks);
    dim3 sizeBlock(numThreads, numThreads);

    matmul<<<sizeGrid, sizeBlock>>>(d_a, d_b, d_c, n, numThreads);
    // Launches report configuration errors only through cudaGetLastError().
    checkCuda(cudaGetLastError(), "matmul launch");

    // The blocking copy also waits for the kernel to finish.
    checkCuda(cudaMemcpy(h_c, d_c, bytes, cudaMemcpyDeviceToHost), "copy of c to host");

    // Compare the GPU result with a straightforward host computation.
    for (int i = 0; i < n; i++) {
        for (int j = 0; j < n; j++) {
            int expected = 0;
            for (int k = 0; k < n; k++) {
                expected += h_a[i * n + k] * h_b[k * n + j];
            }
            if (h_c[i * n + j] != expected) {
                fprintf(stderr, "Mismatch at (%d, %d): got %d, expected %d\n",
                        i, j, h_c[i * n + j], expected);
                return EXIT_FAILURE;
            }
        }
    }

    checkCuda(cudaFreeHost(h_a), "cudaFreeHost(h_a)");
    checkCuda(cudaFreeHost(h_b), "cudaFreeHost(h_b)");
    checkCuda(cudaFreeHost(h_c), "cudaFreeHost(h_c)");

    checkCuda(cudaFree(d_a), "cudaFree(d_a)");
    checkCuda(cudaFree(d_b), "cudaFree(d_b)");
    checkCuda(cudaFree(d_c), "cudaFree(d_c)");

    printf("Success!");

    return 0;
}