<a href="https://colab.research.google.com/github/TechDailyNotes/study-notes-cuda/blob/main/cuda_cublas.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!nvcc --version
!pip install nvcc4jupyter
%load_ext nvcc4jupyter

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Tue_Aug_15_22:02:13_PDT_2023
Cuda compilation tools, release 12.2, V12.2.140
Build cuda_12.2.r12.2/compiler.33191640_0
Collecting nvcc4jupyter
  Downloading nvcc4jupyter-1.2.1-py3-none-any.whl (10 kB)
Installing collected packages: nvcc4jupyter
Successfully installed nvcc4jupyter-1.2.1
Detected platform "Colab". Running its setup...
Source files will be saved in "/tmp/tmpwnq7_b_c".


# Chapter 1: cuBLAS Vector Addition

In [25]:
%%cuda

#include <assert.h>
#include <cublas_v2.h>
#include <cuda_runtime.h>
#include <stdio.h>

// Fills the first n elements of a with pseudo-random whole numbers in
// [0, 99], stored as floats. Values come from rand(), so the sequence
// depends on the current srand() state. Does nothing when n <= 0.
void arrayInit(float *a, int n) {
    for (int remaining = n; remaining > 0; --remaining) {
        const int sample = rand() % 100;
        *a++ = (float) sample;
    }
}

// Asserts, element by element, that c[i] equals scale * a[i] + b[i] for
// the first n elements. The comparison is exact, so it is only reliable
// when every intermediate value is exactly representable in a float (as
// with the whole-number inputs produced by arrayInit). The check is an
// assert(), so it is compiled out when NDEBUG is defined.
void arrayVerify(float *a, float *b, float *c, float scale, int n) {
    if (n <= 0) {
        return;  // nothing to compare
    }

    int i = 0;
    do {
        assert(c[i] == scale * a[i] + b[i]);
    } while (++i < n);
}

int main() {
    int n = 1 << 4;
    size_t bytes = sizeof(float) * n;

    float *h_a = (float*) malloc(bytes);
    float *h_b = (float*) malloc(bytes);
    float *h_c = (float*) malloc(bytes);

    arrayInit(h_a, n);
    arrayInit(h_b, n);
    arrayInit(h_c, n);

    float *d_a, *d_b;
    cudaMalloc(&d_a, bytes);
    cudaMalloc(&d_b, bytes);

    cublasHandle_t handle;
    cublasCreate_v2(&handle);

    cublasSetVector(n, sizeof(float), h_a, 1, d_a, 1);
    cublasSetVector(n, sizeof(float), h_b, 1, d_b, 1);

    const float scale = 1.0f;
    cublasSaxpy(handle, n, &scale, d_a, 1, d_b, 1);

    cublasGetVector(n, sizeof(float), d_b, 1, h_c, 1);

    arrayVerify(h_a, h_b, h_c, scale, n);

    cublasDestroy(handle);

    cudaFree(d_a);
    cudaFree(d_b);

    free(h_a);
    free(h_b);
    free(h_c);

    printf("Success!");
    return 0;
}

/usr/bin/ld: /tmp/tmpxft_00005832_00000000-11_single_file.o: in function `main':
tmpxft_00005832_00000000-6_single_file.cudafe1.cpp:(.text+0x210): undefined reference to `cublasCreate_v2'
/usr/bin/ld: tmpxft_00005832_00000000-6_single_file.cudafe1.cpp:(.text+0x235): undefined reference to `cublasSetVector'
/usr/bin/ld: tmpxft_00005832_00000000-6_single_file.cudafe1.cpp:(.text+0x25a): undefined reference to `cublasSetVector'
/usr/bin/ld: tmpxft_00005832_00000000-6_single_file.cudafe1.cpp:(.text+0x291): undefined reference to `cublasSaxpy_v2'
/usr/bin/ld: tmpxft_00005832_00000000-6_single_file.cudafe1.cpp:(.text+0x2ba): undefined reference to `cublasGetVector'
/usr/bin/ld: tmpxft_00005832_00000000-6_single_file.cudafe1.cpp:(.text+0x2e7): undefined reference to `cublasDestroy_v2'
collect2: error: ld returned 1 exit status



## Practice

In [26]:
%%cuda

#include <assert.h>
#include <cublas_v2.h>
#include <cuda_runtime.h>
#include <stdio.h>

// Writes n pseudo-random whole numbers in [0, 99] into a as floats, in
// index order, drawing one rand() value per element. Leaves a untouched
// when n <= 0.
void arrayInit(float * const a, const int n) {
    int idx = 0;
    while (idx < n) {
        const int sample = rand() % 100;
        a[idx] = (float) sample;
        ++idx;
    }
}

// Host-side check of a saxpy result: asserts c[i] == scale * a[i] + b[i]
// for every i in [0, n). The comparison is exact (no tolerance), and since
// it relies on assert() it disappears when NDEBUG is defined.
void arrayVerify(
    const float * const a,
    const float * const b,
    const float * const c,
    const float scale,
    const int n
) {
    int i = 0;
    while (i < n) {
        assert(c[i] == scale * a[i] + b[i]);
        ++i;
    }
}

int main() {
    int n = 1 << 2;
    size_t bytes = sizeof(float) * n;

    float *h_a = (float*) malloc(bytes);
    float *h_b = (float*) malloc(bytes);
    float *h_c = (float*) malloc(bytes);
    arrayInit(h_a, n);
    arrayInit(h_b, n);

    float *d_a, *d_b;
    cudaMalloc(&d_a, bytes);
    cudaMalloc(&d_b, bytes);

    const float scale = 1.0f;
    cublasHandle_t handle;
    cublasCreate_v2(&handle);

    cublasSetVector(n, sizeof(float), h_a, 1, d_a, 1);
    cublasSetVector(n, sizeof(float), h_b, 1, d_b, 1);
    cublasSaxpy(handle, n, &scale, d_a, 1, d_b, 1);
    cublasGetVector(n, sizeof(float), d_b, 1, h_c, 1);
    arrayVerify(h_a, h_b, h_c, scale, n);

    cublasDestroy(handle);
    cudaFree(d_a);
    cudaFree(d_b);
    free(h_a);
    free(h_b);
    free(h_c);

    printf("Success!");
    return 0;
}

/usr/bin/ld: /tmp/tmpxft_00005869_00000000-11_single_file.o: in function `main':
tmpxft_00005869_00000000-6_single_file.cudafe1.cpp:(.text+0x20c): undefined reference to `cublasCreate_v2'
/usr/bin/ld: tmpxft_00005869_00000000-6_single_file.cudafe1.cpp:(.text+0x231): undefined reference to `cublasSetVector'
/usr/bin/ld: tmpxft_00005869_00000000-6_single_file.cudafe1.cpp:(.text+0x256): undefined reference to `cublasSetVector'
/usr/bin/ld: tmpxft_00005869_00000000-6_single_file.cudafe1.cpp:(.text+0x280): undefined reference to `cublasSaxpy_v2'
/usr/bin/ld: tmpxft_00005869_00000000-6_single_file.cudafe1.cpp:(.text+0x2a9): undefined reference to `cublasGetVector'
/usr/bin/ld: tmpxft_00005869_00000000-6_single_file.cudafe1.cpp:(.text+0x2d6): undefined reference to `cublasDestroy_v2'
collect2: error: ld returned 1 exit status



# Chapter 2: cuBLAS Matrix Multiplication

In [29]:
%%cuda

#include <assert.h>
#include <cublas_v2.h>
#include <cuda_runtime.h>
#include <curand.h>
#include <math.h>
#include <stdio.h>
#include <time.h>

void arrayVerify(
    const float* const a,
    const float* const b,
    const float* const c,
    const int n
) {
    const float epsilon = 0.01f;

    for (int i = 0; i < n; i++) {
        for (int j = 0; j < n; j++) {
            float tmp;
            for (int k = 0; k < n; k++) {
                tmp += a[k * n + i] * b[j * n + k];
            }
            assert(c[j * n + i] - tmp < epsilon);
        }
    }
}

int main() {
    int n = 1 << 4;
    size_t bytes = sizeof(float) * n * n;

    float *h_a = (float*) malloc(bytes);
    float *h_b = (float*) malloc(bytes);
    float *h_c = (float*) malloc(bytes);
    float *d_a, *d_b, *d_c;
    cudaMalloc(&d_a, bytes);
    cudaMalloc(&d_b, bytes);
    cudaMalloc(&d_c, bytes);

    curandGenerator_t prng;
    curandCreateGenerator(&prng, CURAND_RNG_PSEUDO_DEFAULT);
    curandSetPseudoRandomGeneratorSeed(prng, (unsigned long long) clock());
    curandGenerateUniform(prng, d_a, n * n);
    curandGenerateUniform(prng, d_b, n * n);

    const float alpha = 1.0f;
    const float beta = 0.0f;
    cublasHandle_t handle;
    cublasCreate_v2(&handle);
    cublasSgemm(
        handle, CUBLAS_OP_N, CUBLAS_OP_N,
        n, n, n, &alpha, d_a, n, d_b, n,
        &beta, d_c, n
    );

    // cudaMemcpy(h_a, d_a, bytes, cudaMemcpyDeviceToHost);
    // cudaMemcpy(h_b, d_b, bytes, cudaMemcpyDeviceToHost);
    // cudaMemcpy(h_c, d_c, bytes, cudaMemcpyDeviceToHost);
    cublasGetVector(n * n, sizeof(float), d_a, 1, h_a, 1);
    cublasGetVector(n * n, sizeof(float), d_b, 1, h_b, 1);
    cublasGetVector(n * n, sizeof(float), d_c, 1, h_c, 1);

    free(h_a);
    free(h_b);
    free(h_c);
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
    cublasDestroy(handle);

    printf("Success!");
}

/usr/bin/ld: /tmp/tmpxft_0000666c_00000000-11_single_file.o: in function `main':
tmpxft_0000666c_00000000-6_single_file.cudafe1.cpp:(.text+0x1fa): undefined reference to `curandCreateGenerator'
/usr/bin/ld: tmpxft_0000666c_00000000-6_single_file.cudafe1.cpp:(.text+0x211): undefined reference to `curandSetPseudoRandomGeneratorSeed'
/usr/bin/ld: tmpxft_0000666c_00000000-6_single_file.cudafe1.cpp:(.text+0x22d): undefined reference to `curandGenerateUniform'
/usr/bin/ld: tmpxft_0000666c_00000000-6_single_file.cudafe1.cpp:(.text+0x249): undefined reference to `curandGenerateUniform'
/usr/bin/ld: tmpxft_0000666c_00000000-6_single_file.cudafe1.cpp:(.text+0x26b): undefined reference to `cublasCreate_v2'
/usr/bin/ld: tmpxft_0000666c_00000000-6_single_file.cudafe1.cpp:(.text+0x2b8): undefined reference to `cublasSgemm_v2'
/usr/bin/ld: tmpxft_0000666c_00000000-6_single_file.cudafe1.cpp:(.text+0x2e4): undefined reference to `cublasGetVector'
/usr/bin/ld: tmpxft_0000666c_00000000-6_single_file.cuda

## Practice

In [42]:
%%cuda

#include <cublas_v2.h>
#include <cuda_runtime.h>
#include <curand.h>
#include <stdio.h>
#include <time.h>

int main() {
    int n = 1 << 4;
    size_t bytes = sizeof(float) * n * n;

    float *h_a = (float*) malloc(bytes);
    float *h_b = (float*) malloc(bytes);
    float *h_c = (float*) malloc(bytes);
    float *d_a, *d_b, *d_c;
    cudaMalloc(&d_a, bytes);
    cudaMalloc(&d_b, bytes);
    cudaMalloc(&d_c, bytes);

    curandGenerator_t prng;
    curandCreateGenerator(&prng, CURAND_RNG_PSEUDO_DEFAULT);
    curandSetPseudoRandomGeneratorSeed(prng, (unsigned long long) clock());
    curandGenerateUniform(prng, d_a, n * n);
    curandGenerateUniform(prng, d_b, n * n);

    const float alpha = 1.0f;
    const float beta = 0.0f;
    cublasHandle_t handle;
    cublasCreate_v2(&handle);
    cublasSgemm(
        handle, CUBLAS_OP_N, CUBLAS_OP_N,
        n, n, n, &alpha, d_a, n, d_b, n,
        &beta, d_c, n
    );
    cublasGetVector(n * n, sizeof(float), d_a, 1, h_a, 1);
    cublasGetVector(n * n, sizeof(float), d_b, 1, h_b, 1);
    cublasGetVector(n * n, sizeof(float), d_c, 1, h_c, 1);

    free(h_a);
    free(h_b);
    free(h_c);
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
    cublasDestroy(handle);

    printf("Success!");
    return 0;
}

/usr/bin/ld: /tmp/tmpxft_0000795e_00000000-11_single_file.o: in function `main':
tmpxft_0000795e_00000000-6_single_file.cudafe1.cpp:(.text+0xc9): undefined reference to `curandCreateGenerator'
/usr/bin/ld: tmpxft_0000795e_00000000-6_single_file.cudafe1.cpp:(.text+0xe0): undefined reference to `curandSetPseudoRandomGeneratorSeed'
/usr/bin/ld: tmpxft_0000795e_00000000-6_single_file.cudafe1.cpp:(.text+0xfc): undefined reference to `curandGenerateUniform'
/usr/bin/ld: tmpxft_0000795e_00000000-6_single_file.cudafe1.cpp:(.text+0x118): undefined reference to `curandGenerateUniform'
/usr/bin/ld: tmpxft_0000795e_00000000-6_single_file.cudafe1.cpp:(.text+0x13a): undefined reference to `cublasCreate_v2'
/usr/bin/ld: tmpxft_0000795e_00000000-6_single_file.cudafe1.cpp:(.text+0x187): undefined reference to `cublasSgemm_v2'
/usr/bin/ld: tmpxft_0000795e_00000000-6_single_file.cudafe1.cpp:(.text+0x1b3): undefined reference to `cublasGetVector'
/usr/bin/ld: tmpxft_0000795e_00000000-6_single_file.cudafe1