<a href="https://colab.research.google.com/github/ShafeeqAhamedS/PCA-EXP-1-SUM-ARRAY-GPU-AY-23-24/blob/main/PCA_Exp1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git

Collecting git+https://github.com/andreinechaev/nvcc4jupyter.git
  Cloning https://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-f3tat5j2
  Running command git clone --filter=blob:none --quiet https://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-f3tat5j2
  Resolved https://github.com/andreinechaev/nvcc4jupyter.git to commit 28f872a2f99a1b201bcd0db14fdbc5a496b9bfd7
  Installing build dependencies ... done
  Getting requirements to build wheel ... done
  Preparing metadata (pyproject.toml) ... done
Building wheels for collected packages: nvcc4jupyter
  Building wheel for nvcc4jupyter (pyproject.toml) ... done
  Created wheel for nvcc4jupyter: filename=nvcc4jupyter-1.2.1-py3-none-any.whl size=10743 sha256=3a570d3ab74e9051a615aa78d54722c78a69a0c83a1d407fccde14c37a30e96c
  Stored in directory: /tmp/pip-ephem-wheel-cache-fhiowpzi/wheels/a8/b9/18/23f8ef71ceb0f63297dd1903aedd067e6243a68ea756d6feea
Successfully built nvcc4jupyter

In [2]:
%load_ext nvcc4jupyter

Detected platform "Colab". Running its setup...
Source files will be saved in "/tmp/tmpaz71mpiz".


In [3]:
%%cuda
#include <cuda_runtime.h>
#include <stdio.h>
#include <sys/time.h>

#ifndef _COMMON_H
#define _COMMON_H

// CHECK(call): evaluate a CUDA runtime call exactly once. If the result is not
// cudaSuccess, print the file, line, numeric code and cudaGetErrorString()
// text to stderr and terminate with exit(1).
// The body is wrapped in do { } while (0) so that `CHECK(x);` expands to a
// single statement and remains valid inside an unbraced if/else; `call` is
// parenthesized so any expression can be passed.
#define CHECK(call)                                                            \
do {                                                                           \
    const cudaError_t error = (call);                                          \
    if (error != cudaSuccess)                                                  \
    {                                                                          \
        fprintf(stderr, "Error: %s:%d, ", __FILE__, __LINE__);                 \
        fprintf(stderr, "code: %d, reason: %s\n", (int)error,                  \
                cudaGetErrorString(error));                                    \
        exit(1);                                                               \
    }                                                                          \
} while (0)

// CHECK_CUBLAS(call): evaluate a cuBLAS call once; on any status other than
// CUBLAS_STATUS_SUCCESS, print the numeric status with file/line to stderr and
// exit(1). Expands to a single statement (do { } while (0)) so it is safe in
// an unbraced if/else. Only usable where the cuBLAS header has been included.
#define CHECK_CUBLAS(call)                                                     \
do {                                                                           \
    const cublasStatus_t err = (call);                                         \
    if (err != CUBLAS_STATUS_SUCCESS)                                          \
    {                                                                          \
        fprintf(stderr, "Got CUBLAS error %d at %s:%d\n", (int)err, __FILE__,  \
                __LINE__);                                                     \
        exit(1);                                                               \
    }                                                                          \
} while (0)

// CHECK_CURAND(call): evaluate a cuRAND call once; on any status other than
// CURAND_STATUS_SUCCESS, print the numeric status with file/line to stderr and
// exit(1). Expands to a single statement (do { } while (0)) so it is safe in
// an unbraced if/else. Only usable where the cuRAND header has been included.
#define CHECK_CURAND(call)                                                     \
do {                                                                           \
    const curandStatus_t err = (call);                                         \
    if (err != CURAND_STATUS_SUCCESS)                                          \
    {                                                                          \
        fprintf(stderr, "Got CURAND error %d at %s:%d\n", (int)err, __FILE__,  \
                __LINE__);                                                     \
        exit(1);                                                               \
    }                                                                          \
} while (0)

// CHECK_CUFFT(call): evaluate a cuFFT call once; on any result other than
// CUFFT_SUCCESS, print the numeric result with file/line to stderr and
// exit(1). Expands to a single statement (do { } while (0)) so it is safe in
// an unbraced if/else. Only usable where the cuFFT header has been included.
#define CHECK_CUFFT(call)                                                      \
do {                                                                           \
    const cufftResult err = (call);                                            \
    if (err != CUFFT_SUCCESS)                                                  \
    {                                                                          \
        fprintf(stderr, "Got CUFFT error %d at %s:%d\n", (int)err, __FILE__,   \
                __LINE__);                                                     \
        exit(1);                                                               \
    }                                                                          \
} while (0)

// CHECK_CUSPARSE(call): evaluate a cuSPARSE call once; on any status other
// than CUSPARSE_STATUS_SUCCESS, print the numeric status with file/line to
// stderr, additionally report a pending CUDA runtime error (queried with
// cudaGetLastError) if there is one, and exit(1).
// Expands to a single statement (do { } while (0)) so it is safe in an
// unbraced if/else. Only usable where the cuSPARSE header has been included.
#define CHECK_CUSPARSE(call)                                                   \
do {                                                                           \
    const cusparseStatus_t err = (call);                                       \
    if (err != CUSPARSE_STATUS_SUCCESS)                                        \
    {                                                                          \
        fprintf(stderr, "Got error %d at %s:%d\n", (int)err, __FILE__,         \
                __LINE__);                                                     \
        cudaError_t cuda_err = cudaGetLastError();                             \
        if (cuda_err != cudaSuccess)                                           \
        {                                                                      \
            fprintf(stderr, "  CUDA error \"%s\" also detected\n",             \
                    cudaGetErrorString(cuda_err));                             \
        }                                                                      \
        exit(1);                                                               \
    }                                                                          \
} while (0)

// Return the current wall-clock time from gettimeofday() as a double:
// whole seconds plus the microsecond field scaled by 1e-6.
// Intended for coarse host-side interval timing (difference of two calls).
inline double seconds()
{
    struct timeval tp;
    // Pass NULL for the timezone: POSIX leaves the behavior unspecified when
    // that argument is non-NULL, and the value was never used here anyway.
    (void)gettimeofday(&tp, NULL);
    return ((double)tp.tv_sec + (double)tp.tv_usec * 1.e-6);
}
#endif // _COMMON_H
// Compare the host reference against the GPU result element by element.
//   hostRef : N values computed on the host
//   gpuRef  : N values copied back from the device
//   N       : number of elements to compare (N <= 0 compares nothing)
// Prints "Arrays match." when every pair is within epsilon; otherwise prints
// the first offending pair with its index and stops scanning.
void checkResult(float *hostRef, float *gpuRef, const int N)
{
    const double epsilon = 1.0E-8;
    bool match = true;

    for (int i = 0; i < N; i++)
    {
        // Take the difference in double and test it against +/-epsilon
        // directly. This avoids abs(), which can resolve to the integer
        // overload and truncate a fractional difference to 0. The negated
        // form also treats a NaN difference as a mismatch.
        const double diff = (double)hostRef[i] - (double)gpuRef[i];
        if (!(diff <= epsilon && diff >= -epsilon))
        {
            match = false;
            printf("Arrays do not match!\n");
            printf("host %5.2f gpu %5.2f at current %d\n", hostRef[i],
                   gpuRef[i], i);
            break;
        }
    }

    if (match) printf("Arrays match.\n\n");
}

// Fill ip[0..size-1] with pseudo-random values (rand() & 0xFF) / 10.0f,
// i.e. multiples of 0.1 in the range [0.0, 25.5].
//   ip   : destination buffer with room for at least `size` floats
//   size : number of elements to write (size <= 0 writes nothing)
void initialData(float *ip, int size)
{
    // Seed the generator only on the first call. Re-seeding with time() on
    // every call restarts the same sequence whenever two calls land in the
    // same second, which made consecutively initialized arrays identical.
    static bool seeded = false;
    if (!seeded)
    {
        srand((unsigned) time(NULL));
        seeded = true;
    }

    for (int i = 0; i < size; i++)
    {
        ip[i] = (float)( rand() & 0xFF ) / 10.0f;
    }
}

// Host reference implementation: writes C[k] = A[k] + B[k] for k in [0, N),
// visiting elements in increasing index order. Nothing is written when N <= 0.
void sumArraysOnHost(float *A, float *B, float *C, const int N)
{
    int k = 0;
    while (k < N)
    {
        const float lhs = A[k];
        const float rhs = B[k];
        C[k] = lhs + rhs;
        ++k;
    }
}


// Element-wise vector add on the device: C[i] = A[i] + B[i] for i in [0, N).
//   A, B : device pointers to at least N input floats
//   C    : device pointer to at least N output floats
//   N    : element count; N <= 0 performs no work
// Launch layout: 1-D grid of 1-D blocks. Each thread starts at its global
// index and advances by the total thread count, so any grid size covers all
// N elements. No shared memory is used.
__global__ void sumArraysOnGPU(float *A, float *B, float *C, const int N){
    // Index in size_t: the previous 32-bit index became a negative int once
    // the grid held more than 2^31 threads, which passed `i < N` and read
    // out of bounds.
    const size_t count  = (N > 0) ? (size_t)N : 0;
    const size_t stride = (size_t)gridDim.x * blockDim.x;
    for (size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
         i < count; i += stride)
    {
        C[i] = A[i] + B[i];
    }
}



// Driver: adds two 2^24-element float vectors on the host and on GPU device 0
// with a block size of 512, prints the elapsed time of each phase, and
// compares the GPU result against the host reference.
// Returns 0 on completion, 1 if a host allocation fails; any CUDA API error
// terminates the process through CHECK.
int main(int argc, char **argv)
{
    (void)argc;
    printf("%s Starting...\n", argv[0]);

    // set up device
    int dev = 0;
    cudaDeviceProp deviceProp;
    CHECK(cudaGetDeviceProperties(&deviceProp, dev));
    printf("Using Device %d: %s\n", dev, deviceProp.name);
    CHECK(cudaSetDevice(dev));

    // set up data size of vectors
    const int nElem = 1 << 24;
    printf("Vector size %d\n", nElem);

    // malloc host memory (byte count computed in size_t, not int)
    const size_t nBytes = (size_t)nElem * sizeof(float);

    float *h_A     = (float *)malloc(nBytes);
    float *h_B     = (float *)malloc(nBytes);
    float *hostRef = (float *)malloc(nBytes);
    float *gpuRef  = (float *)malloc(nBytes);

    // Four 64 MiB buffers can fail on a constrained host; stop cleanly
    // instead of dereferencing NULL below.
    if (h_A == NULL || h_B == NULL || hostRef == NULL || gpuRef == NULL)
    {
        fprintf(stderr, "Error: failed to allocate %zu bytes of host memory\n",
                nBytes);
        free(h_A);
        free(h_B);
        free(hostRef);
        free(gpuRef);
        return 1;
    }

    double iStart, iElaps;

    // initialize data at host side
    iStart = seconds();
    initialData(h_A, nElem);
    initialData(h_B, nElem);
    iElaps = seconds() - iStart;
    printf("initialData Time elapsed %f sec\n", iElaps);
    memset(hostRef, 0, nBytes);
    memset(gpuRef,  0, nBytes);

    // add vector at host side for result checks
    iStart = seconds();
    sumArraysOnHost(h_A, h_B, hostRef, nElem);
    iElaps = seconds() - iStart;
    printf("sumArraysOnHost Time elapsed %f sec\n", iElaps);

    // malloc device global memory
    float *d_A = NULL, *d_B = NULL, *d_C = NULL;
    CHECK(cudaMalloc((void **)&d_A, nBytes));
    CHECK(cudaMalloc((void **)&d_B, nBytes));
    CHECK(cudaMalloc((void **)&d_C, nBytes));

    // transfer inputs from host to device
    CHECK(cudaMemcpy(d_A, h_A, nBytes, cudaMemcpyHostToDevice));
    CHECK(cudaMemcpy(d_B, h_B, nBytes, cudaMemcpyHostToDevice));
    // Zero the output on the device directly rather than copying a zeroed
    // host buffer; a kernel that wrote nothing would then fail the check.
    CHECK(cudaMemset(d_C, 0, nBytes));

    // invoke kernel at host side
    const int iLen = 512;
    dim3 block (iLen);
    dim3 grid  ((nElem + block.x - 1) / block.x);

    iStart = seconds();
    sumArraysOnGPU<<<grid, block>>>(d_A, d_B, d_C, nElem);
    // Launch-configuration errors are reported by cudaGetLastError(), so
    // query it right after the launch; the synchronize then surfaces
    // errors raised while the kernel executes.
    CHECK(cudaGetLastError());
    CHECK(cudaDeviceSynchronize());
    iElaps = seconds() - iStart;
    // NOTE(review): this is the only launch in the process, so the figure
    // may include one-time start-up cost - confirm with a warm-up launch.
    printf("sumArraysOnGPU <<<  %d, %d  >>>  Time elapsed %f sec\n", grid.x,
           block.x, iElaps);

    // copy kernel result back to host side
    CHECK(cudaMemcpy(gpuRef, d_C, nBytes, cudaMemcpyDeviceToHost));

    // check device results
    checkResult(hostRef, gpuRef, nElem);

    // free device global memory
    CHECK(cudaFree(d_A));
    CHECK(cudaFree(d_B));
    CHECK(cudaFree(d_C));

    // free host memory
    free(h_A);
    free(h_B);
    free(hostRef);
    free(gpuRef);

    return(0);
}

/tmp/tmpaz71mpiz/b08ab6f5-258b-4fa4-8025-7b1622994b90/cuda_exec.out Starting...
Using Device 0: Tesla T4
Vector size 16777216
initialData Time elapsed 0.729443 sec
sumArraysOnHost Time elapsed 0.069198 sec
sumArraysOnGPU <<<  32768, 512  >>>  Time elapsed 0.134935 sec
Arrays match.




In [9]:
%%cuda
#include <cuda_runtime.h>
#include <stdio.h>
#include <sys/time.h>

#ifndef _COMMON_H
#define _COMMON_H

// CHECK(call): run a CUDA runtime call once and abort the program on failure.
// On any result other than cudaSuccess it writes the source location, the
// numeric error code and the cudaGetErrorString() message to stderr, then
// calls exit(1).
// Written as do { } while (0) so the macro behaves as one statement (safe
// before `else` and in unbraced branches); the argument is parenthesized.
#define CHECK(call)                                                            \
do {                                                                           \
    const cudaError_t error = (call);                                          \
    if (error != cudaSuccess)                                                  \
    {                                                                          \
        fprintf(stderr, "Error: %s:%d, ", __FILE__, __LINE__);                 \
        fprintf(stderr, "code: %d, reason: %s\n", (int)error,                  \
                cudaGetErrorString(error));                                    \
        exit(1);                                                               \
    }                                                                          \
} while (0)

// Wall-clock timestamp in seconds, built from gettimeofday(): tv_sec plus
// tv_usec * 1e-6. Callers subtract two readings to time a host-side phase.
inline double seconds()
{
    struct timeval now;
    // NULL timezone: a non-NULL value has unspecified behavior under POSIX,
    // and the previously requested timezone was never read.
    (void)gettimeofday(&now, NULL);
    return ((double)now.tv_sec + (double)now.tv_usec * 1.e-6);
}
#endif // _COMMON_H

// Verify the GPU output against the host reference.
//   hostRef : N host-computed values
//   gpuRef  : N values copied back from the device
//   N       : number of elements to compare (nothing is compared if N <= 0)
// Reports the first pair that differs by more than epsilon (with its index)
// and stops; prints "Arrays match." if no such pair exists.
void checkResult(float *hostRef, float *gpuRef, const int N)
{
    const double epsilon = 1.0E-8;
    bool match = true;

    for (int i = 0; i < N; i++)
    {
        // Explicit two-sided comparison in double instead of abs(): abs()
        // may bind to the integer overload and silently drop fractional
        // differences. Written in negated form so NaN counts as a mismatch.
        const double delta = (double)hostRef[i] - (double)gpuRef[i];
        const bool withinTolerance = (delta <= epsilon) && (delta >= -epsilon);
        if (!withinTolerance)
        {
            match = false;
            printf("Arrays do not match!\n");
            printf("host %5.2f gpu %5.2f at current %d\n", hostRef[i],
                   gpuRef[i], i);
            break;
        }
    }

    if (match) printf("Arrays match.\n\n");
}

// Populate ip[0..size-1] with pseudo-random test data. Each element is
// (rand() & 0xFF) / 10.0f, so values are multiples of 0.1 in [0.0, 25.5].
//   ip   : buffer receiving `size` floats
//   size : element count; size <= 0 leaves the buffer untouched
void initialData(float *ip, int size)
{
    // One-time seeding. Calling srand(time()) on every invocation reset the
    // generator to the same state for calls made within the same second, so
    // arrays filled back-to-back (and in successive runs of the caller's
    // loop) received identical contents.
    static bool seeded = false;
    if (!seeded)
    {
        srand((unsigned) time(NULL));
        seeded = true;
    }

    for (int i = 0; i < size; i++)
    {
        ip[i] = (float)( rand() & 0xFF ) / 10.0f;
    }
}

// CPU version of the vector add, used as the reference result:
// stores A[j] + B[j] into C[j] for every j from 0 up to N - 1, in order.
// A non-positive N results in no writes.
void sumArraysOnHost(float *A, float *B, float *C, const int N)
{
    for (int remaining = N, j = 0; remaining > 0; --remaining, ++j)
    {
        float total = A[j];
        total += B[j];
        C[j] = total;
    }
}


// Device vector add: C[i] = A[i] + B[i] for every i in [0, N).
//   A, B : device pointers, each holding at least N floats (inputs)
//   C    : device pointer with room for at least N floats (output)
//   N    : number of elements; no work is done when N <= 0
// Expects a 1-D grid of 1-D blocks. Threads step through the data by the
// total number of launched threads, so the result is complete for any
// grid/block combination. Uses no shared memory.
__global__ void sumArraysOnGPU(float *A, float *B, float *C, const int N)
{
    // 64-bit indexing: with a 32-bit index, a grid of more than 2^31 threads
    // produced negative int indices that satisfied `i < N` and accessed
    // memory outside the arrays.
    const size_t count  = (N > 0) ? (size_t)N : 0;
    const size_t stride = (size_t)gridDim.x * blockDim.x;
    for (size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
         i < count; i += stride)
    {
        C[i] = A[i] + B[i];
    }
}


// Driver: runs the 2^24-element vector add once per entry of blockSizes and,
// for each block size, prints the phase timings and the host/GPU comparison.
// Returns 0 on completion, 1 if a host allocation fails; any CUDA API error
// terminates the process through CHECK.
int main(int argc, char **argv)
{
    (void)argc;
    (void)argv;

    // Set up different block sizes; the count is derived from the array so
    // adding or removing an entry cannot desynchronize the loop bound.
    const int blockSizes[] = { 256, 512, 1023, 1024};
    const int numBlockSizes = (int)(sizeof(blockSizes) / sizeof(blockSizes[0]));

    // Device selection, sizes and buffers are identical for every block
    // size, so set them up once instead of once per iteration.
    int dev = 0;
    cudaDeviceProp deviceProp;
    CHECK(cudaGetDeviceProperties(&deviceProp, dev));
    CHECK(cudaSetDevice(dev));

    const int nElem = 1 << 24;
    const size_t nBytes = (size_t)nElem * sizeof(float);

    // malloc host memory
    float *h_A     = (float *)malloc(nBytes);
    float *h_B     = (float *)malloc(nBytes);
    float *hostRef = (float *)malloc(nBytes);
    float *gpuRef  = (float *)malloc(nBytes);

    // Stop cleanly if any of the four 64 MiB host buffers is unavailable.
    if (h_A == NULL || h_B == NULL || hostRef == NULL || gpuRef == NULL)
    {
        fprintf(stderr, "Error: failed to allocate %zu bytes of host memory\n",
                nBytes);
        free(h_A);
        free(h_B);
        free(hostRef);
        free(gpuRef);
        return 1;
    }

    // malloc device global memory
    float *d_A = NULL, *d_B = NULL, *d_C = NULL;
    CHECK(cudaMalloc((void **)&d_A, nBytes));
    CHECK(cudaMalloc((void **)&d_B, nBytes));
    CHECK(cudaMalloc((void **)&d_C, nBytes));

    // Iterate over each block size
    for (int i = 0; i < numBlockSizes; i++)
    {
        printf("Running with block.x = %d\n", blockSizes[i]);
        printf("Using Device %d: %s\n", dev, deviceProp.name);
        printf("Vector size %d\n", nElem);

        double iStart, iElaps;

        // initialize data at host side
        iStart = seconds();
        initialData(h_A, nElem);
        initialData(h_B, nElem);
        iElaps = seconds() - iStart;
        printf("initialData Time elapsed %f sec\n", iElaps);
        memset(hostRef, 0, nBytes);
        memset(gpuRef,  0, nBytes);

        // add vector at host side for result checks
        iStart = seconds();
        sumArraysOnHost(h_A, h_B, hostRef, nElem);
        iElaps = seconds() - iStart;
        printf("sumArraysOnHost Time elapsed %f sec\n", iElaps);

        // transfer inputs from host to device
        CHECK(cudaMemcpy(d_A, h_A, nBytes, cudaMemcpyHostToDevice));
        CHECK(cudaMemcpy(d_B, h_B, nBytes, cudaMemcpyHostToDevice));
        // The device buffers are reused across iterations: clear the output
        // so a stale result from the previous block size cannot make a
        // kernel that wrote nothing look correct.
        CHECK(cudaMemset(d_C, 0, nBytes));

        // set up execution configuration
        const int blockSize = blockSizes[i];
        dim3 block(blockSize);
        dim3 grid((nElem + block.x - 1) / block.x);

        // invoke kernel at host side
        iStart = seconds();
        sumArraysOnGPU<<<grid, block>>>(d_A, d_B, d_C, nElem);
        // Query launch-configuration errors right after the launch, then
        // synchronize to surface errors raised during execution.
        CHECK(cudaGetLastError());
        CHECK(cudaDeviceSynchronize());
        iElaps = seconds() - iStart;
        printf("sumArraysOnGPU <<< %d, %d >>> Time elapsed %f sec\n", grid.x, block.x, iElaps);

        // copy kernel result back to host side
        CHECK(cudaMemcpy(gpuRef, d_C, nBytes, cudaMemcpyDeviceToHost));

        // check device results
        checkResult(hostRef, gpuRef, nElem);

        printf("\n"); // Separate outputs for readability
    }

    // free device global memory
    CHECK(cudaFree(d_A));
    CHECK(cudaFree(d_B));
    CHECK(cudaFree(d_C));

    // free host memory
    free(h_A);
    free(h_B);
    free(hostRef);
    free(gpuRef);

    return(0);
}


Running with block.x = 256
Using Device 0: Tesla T4
Vector size 16777216
initialData Time elapsed 0.687592 sec
sumArraysOnHost Time elapsed 0.052741 sec
sumArraysOnGPU <<< 65536, 256 >>> Time elapsed 0.001025 sec
Arrays match.


Running with block.x = 512
Using Device 0: Tesla T4
Vector size 16777216
initialData Time elapsed 0.670395 sec
sumArraysOnHost Time elapsed 0.054042 sec
sumArraysOnGPU <<< 32768, 512 >>> Time elapsed 0.000861 sec
Arrays match.


Running with block.x = 1023
Using Device 0: Tesla T4
Vector size 16777216
initialData Time elapsed 0.669788 sec
sumArraysOnHost Time elapsed 0.053019 sec
sumArraysOnGPU <<< 16401, 1023 >>> Time elapsed 0.000919 sec
Arrays match.


Running with block.x = 1024
Using Device 0: Tesla T4
Vector size 16777216
initialData Time elapsed 0.670355 sec
sumArraysOnHost Time elapsed 0.055307 sec
sumArraysOnGPU <<< 16384, 1024 >>> Time elapsed 0.000879 sec
Arrays match.



