<a href="https://colab.research.google.com/github/RSid8/PCAPLAB/blob/main/CudaLab8.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install git+git://github.com/andreinechaev/nvcc4jupyter.git

%load_ext nvcc_plugin

Collecting git+git://github.com/andreinechaev/nvcc4jupyter.git
  Cloning git://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-acudxyx1
  Running command git clone -q git://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-acudxyx1
Building wheels for collected packages: NVCCPlugin
  Building wheel for NVCCPlugin (setup.py) ... [?25l[?25hdone
  Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-cp37-none-any.whl size=4307 sha256=e4bd96f929ca67453ea7cd025682afb400dd790d23eed299275fe10b7e6f9110
  Stored in directory: /tmp/pip-ephem-wheel-cache-7joqtl0l/wheels/10/c2/05/ca241da37bff77d60d31a9174f988109c61ba989e4d4650516
Successfully built NVCCPlugin
Installing collected packages: NVCCPlugin
Successfully installed NVCCPlugin-0.0.2
created output directory at /content/src
Out bin /content/result.out


In [None]:
%%cu
#include<string.h>
#include<stdio.h>
#include<stdlib.h>
#include<time.h>
// Compile-time problem dimensions used by the kernels and main() of this cell.
// MASK_WIDTH: number of taps in the convolution mask.
#define MASK_WIDTH 5
// TILE_SIZE: size of the per-block shared-memory tile; main() also uses it as
// the block size when launching ConvSharedMemKernel.
#define TILE_SIZE 4
// INPUT_SIZE: number of elements in the input and output arrays.
#define INPUT_SIZE 12
// Convolution mask stored in constant memory; main() fills it from the host
// with cudaMemcpyToSymbol before launching the kernels.
__constant__ float M[MASK_WIDTH];

/*
 * 1D convolution of N with the mask M held in constant memory.
 *
 * Each thread produces one output element P[i], where i is the flat global
 * thread index of a 1D launch. Input positions that fall outside [0, width)
 * are treated as zero (they are simply skipped).
 *
 *   N     - device pointer to the input array (width elements)
 *   P     - device pointer to the output array (width elements)
 *   width - number of elements in N and P
 *
 * Threads whose index is >= width do nothing, so the launch may contain more
 * threads than elements.
 */
__global__ void Conv1DKernel(float *N, float *P, int width){
    int i = blockIdx.x*blockDim.x + threadIdx.x;
    if(i >= width){
        return;                                   // surplus thread: no element to compute
    }
    float Pvalue = 0.0f;
    int N_start_point = i - (MASK_WIDTH/2);       // leftmost input element under the mask
    for(int j=0; j<MASK_WIDTH; j++){
        int N_index = N_start_point + j;
        if(N_index>=0 && N_index<width){          // skip "ghost" elements beyond the array edges
            Pvalue += N[N_index]*M[j];
        }
    }
    P[i] = Pvalue;
}

/*
 * Tiled 1D convolution of N (INPUT_SIZE elements) with the constant-memory
 * mask M.
 *
 * Each block first stages its own TILE_SIZE-element slice of N in shared
 * memory. Neighbours that belong to the same tile are then read from shared
 * memory; neighbours that belong to an adjacent tile are read directly from
 * global memory. Positions outside [0, INPUT_SIZE) contribute nothing.
 *
 * Launch requirements: 1D grid with blockDim.x <= TILE_SIZE (main() uses
 * exactly TILE_SIZE). The grid may cover more than INPUT_SIZE threads; the
 * surplus threads neither read N nor write P.
 *
 *   N - device pointer to the input array (INPUT_SIZE elements)
 *   P - device pointer to the output array (INPUT_SIZE elements)
 */
__global__ void ConvSharedMemKernel(float* N, float *P){
    int i = blockIdx.x*blockDim.x + threadIdx.x;
    __shared__ float N_s[TILE_SIZE];
    // Guarded load: threads past the end of N store 0 instead of reading out of bounds.
    N_s[threadIdx.x] = (i < INPUT_SIZE) ? N[i] : 0.0f;
    // The barrier is placed before the early exit so every thread of the block reaches it.
    __syncthreads();
    if(i >= INPUT_SIZE){
        return;                                           // surplus thread: nothing to compute
    }
    int this_tile_start_point = blockIdx.x*blockDim.x;    // first global index held in N_s
    int next_tile_start_point = (blockIdx.x+1)*blockDim.x; // first global index of the next tile
    int N_start_point = i - (MASK_WIDTH/2);
    float Pvalue = 0.0f;
    for(int j=0;j<MASK_WIDTH;j++){
        int N_index = N_start_point+j;
        if(N_index>=0 && N_index<INPUT_SIZE){
            if((N_index>=this_tile_start_point)&&(N_index<next_tile_start_point)){
                // Element belongs to this block's tile: read it from shared memory.
                Pvalue += N_s[N_index-this_tile_start_point]*M[j];
            }
            else{
                // Halo element owned by a neighbouring tile: read it from global memory.
                Pvalue += N[N_index]*M[j];
            }
        }
    }
    P[i] = Pvalue;
}

/* Print a readable message and abort when a CUDA runtime call fails. */
static void checkCuda(cudaError_t status, const char *what){
    if(status != cudaSuccess){
        fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/*
 * Host driver: builds a random input array and mask, runs the constant-memory
 * kernel and the shared-memory kernel on the same data, and prints each
 * result together with the kernel time measured with CUDA events.
 */
int main(){
    srand((unsigned int)time(NULL));
    const size_t signal_bytes = INPUT_SIZE*sizeof(float);
    cudaEvent_t start, stop;
    checkCuda(cudaEventCreate(&start), "cudaEventCreate(start)");
    checkCuda(cudaEventCreate(&stop), "cudaEventCreate(stop)");
    float milliseconds=0;
    float h_N[INPUT_SIZE];                              //host input array
    float h_M[MASK_WIDTH];                              //host copy of the mask
    float h_P[INPUT_SIZE];                              //host output array
    printf("Input array N:\n");
    for(int i=0;i<INPUT_SIZE; i++){
        h_N[i]=(float)rand()/(float)(RAND_MAX/15);
        printf("%f ", h_N[i]);
    }
    printf("\n\n");
    printf("Input mask M:\n");
    for(int i=0;i<MASK_WIDTH; i++){
        h_M[i]=(float)rand()/(float)(RAND_MAX/5);
        printf("%f ", h_M[i]);
    }
    printf("\n\n");
    float *d_N, *d_P;

    checkCuda(cudaMalloc(&d_N, signal_bytes), "cudaMalloc(d_N)");
    checkCuda(cudaMalloc(&d_P, signal_bytes), "cudaMalloc(d_P)");

    // Only the input and the mask are uploaded; the output buffer is cleared
    // on the device rather than being filled from the uninitialised h_P.
    checkCuda(cudaMemcpy(d_N, h_N, signal_bytes, cudaMemcpyHostToDevice), "cudaMemcpy(d_N)");
    checkCuda(cudaMemcpyToSymbol(M, h_M, MASK_WIDTH*sizeof(float)), "cudaMemcpyToSymbol(M)");

    // ---- constant-memory kernel ----
    checkCuda(cudaMemset(d_P, 0, signal_bytes), "cudaMemset(d_P)");
    checkCuda(cudaEventRecord(start), "cudaEventRecord(start)");
    Conv1DKernel<<<1, INPUT_SIZE>>>(d_N, d_P, INPUT_SIZE);
    checkCuda(cudaEventRecord(stop), "cudaEventRecord(stop)");
    checkCuda(cudaGetLastError(), "Conv1DKernel launch");
    checkCuda(cudaEventSynchronize(stop), "Conv1DKernel execution");
    checkCuda(cudaEventElapsedTime(&milliseconds, start, stop), "cudaEventElapsedTime");
    checkCuda(cudaMemcpy(h_P, d_P, signal_bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(h_P)");
    printf("Constant memory result:\n");
    for(int i=0; i<INPUT_SIZE; i++){
        printf("%f ", h_P[i]);
    }
    printf("\n");
    printf("Time elapsed: %f ms\n", milliseconds);

    // ---- shared-memory kernel ----
    // Clear the output first so a failed run cannot show the previous result.
    checkCuda(cudaMemset(d_P, 0, signal_bytes), "cudaMemset(d_P)");
    checkCuda(cudaEventRecord(start), "cudaEventRecord(start)");
    ConvSharedMemKernel<<<(INPUT_SIZE+TILE_SIZE-1)/TILE_SIZE, TILE_SIZE>>>(d_N, d_P);
    checkCuda(cudaEventRecord(stop), "cudaEventRecord(stop)");
    checkCuda(cudaGetLastError(), "ConvSharedMemKernel launch");
    checkCuda(cudaEventSynchronize(stop), "ConvSharedMemKernel execution");
    checkCuda(cudaEventElapsedTime(&milliseconds, start, stop), "cudaEventElapsedTime");
    checkCuda(cudaMemcpy(h_P, d_P, signal_bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(h_P)");
    printf("Shared memory result:\n");
    for(int i=0; i<INPUT_SIZE; i++){
        printf("%f ", h_P[i]);
    }
    printf("\n");
    printf("Time elapsed: %f ms\n", milliseconds);

    // M is a __constant__ symbol, not a cudaMalloc allocation, so it must not
    // be passed to cudaFree; only the two device buffers and the events are released.
    checkCuda(cudaFree(d_N), "cudaFree(d_N)");
    checkCuda(cudaFree(d_P), "cudaFree(d_P)");
    checkCuda(cudaEventDestroy(start), "cudaEventDestroy(start)");
    checkCuda(cudaEventDestroy(stop), "cudaEventDestroy(stop)");
    return 0;
}


Input array N:
10.611133 3.894571 12.152851 9.161904 0.178203 14.148376 3.978005 2.424837 6.684398 5.979529 11.941113 12.529559 

Input mask M:
4.093635 1.512890 1.005996 0.913008 3.132183 

Constant memory result:
52.295475 59.763798 70.478928 88.023705 89.166969 63.235279 49.286934 91.207832 69.538589 76.201729 59.862144 55.148281 
Time elapsed: 0.020576 ms
Shared memory result:
52.295475 59.763798 70.478928 88.023705 89.166969 63.235279 49.286934 91.207832 69.538589 76.201729 59.862144 55.148281 
Time elapsed: 0.011424 ms



In [None]:
%%cu
#include<string.h>
#include<stdio.h>
#include<stdlib.h>
#include<time.h>
// MASK_WIDTH: number of taps in the convolution mask.
#define MASK_WIDTH 5
// INPUT_SIZE: number of elements in the input and output arrays.
#define INPUT_SIZE 12
/*
 * 1D convolution of N with the mask M, both passed as device pointers.
 *
 * Each thread produces one output element P[i], where i is the flat global
 * thread index of a 1D launch. Input positions outside [0, width) are
 * skipped, i.e. treated as zero.
 *
 *   N     - device pointer to the input array (width elements)
 *   M     - device pointer to the mask (MASK_WIDTH elements)
 *   P     - device pointer to the output array (width elements)
 *   width - number of elements in N and P
 *
 * Threads whose index is >= width do nothing, so the launch may contain more
 * threads than elements.
 */
__global__ void Conv1DNaiveKernel(float *N, float *M, float *P, int width){
    int i = blockIdx.x*blockDim.x + threadIdx.x;
    if(i >= width){
        return;                                   // surplus thread: no element to compute
    }
    float Pvalue = 0.0f;
    int N_start_point = i - (MASK_WIDTH/2);       // leftmost input element under the mask
    for(int j=0; j<MASK_WIDTH; j++){
        int N_index = N_start_point + j;
        if(N_index>=0 && N_index<width){          // skip positions beyond the array edges
            Pvalue += N[N_index]*M[j];
        }
    }
    P[i] = Pvalue;
}

/* Print a readable message and abort when a CUDA runtime call fails. */
static void checkCuda(cudaError_t status, const char *what){
    if(status != cudaSuccess){
        fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/*
 * Host driver: builds a random input array and mask, runs the global-memory
 * convolution kernel, and prints the result together with the kernel time
 * measured with CUDA events.
 */
int main(){
    srand((unsigned int)time(NULL));
    const size_t signal_bytes = INPUT_SIZE*sizeof(float);
    const size_t mask_bytes = MASK_WIDTH*sizeof(float);
    cudaEvent_t start, stop;
    checkCuda(cudaEventCreate(&start), "cudaEventCreate(start)");
    checkCuda(cudaEventCreate(&stop), "cudaEventCreate(stop)");
    float milliseconds=0;
    float h_N[INPUT_SIZE];                              //host input array
    float h_M[MASK_WIDTH];                              //host copy of the mask
    float h_P[INPUT_SIZE];                              //host output array
    printf("Input array N:\n");
    for(int i=0;i<INPUT_SIZE; i++){
        h_N[i]=(float)rand()/(float)(RAND_MAX/15);
        printf("%f ", h_N[i]);
    }
    printf("\n\n");
    printf("Input mask M:\n");
    for(int i=0;i<MASK_WIDTH; i++){
        h_M[i]=(float)rand()/(float)(RAND_MAX/5);
        printf("%f ", h_M[i]);
    }
    printf("\n\n");
    float *d_N, *d_P, *d_M;

    checkCuda(cudaMalloc(&d_N, signal_bytes), "cudaMalloc(d_N)");
    checkCuda(cudaMalloc(&d_P, signal_bytes), "cudaMalloc(d_P)");
    checkCuda(cudaMalloc(&d_M, mask_bytes), "cudaMalloc(d_M)");
    // Only the inputs are uploaded; the output buffer is cleared on the device
    // rather than being filled from the uninitialised h_P.
    checkCuda(cudaMemcpy(d_N, h_N, signal_bytes, cudaMemcpyHostToDevice), "cudaMemcpy(d_N)");
    checkCuda(cudaMemcpy(d_M, h_M, mask_bytes, cudaMemcpyHostToDevice), "cudaMemcpy(d_M)");
    checkCuda(cudaMemset(d_P, 0, signal_bytes), "cudaMemset(d_P)");

    checkCuda(cudaEventRecord(start), "cudaEventRecord(start)");
    Conv1DNaiveKernel<<<1, INPUT_SIZE>>>(d_N, d_M, d_P, INPUT_SIZE);
    checkCuda(cudaEventRecord(stop), "cudaEventRecord(stop)");
    checkCuda(cudaGetLastError(), "Conv1DNaiveKernel launch");
    checkCuda(cudaEventSynchronize(stop), "Conv1DNaiveKernel execution");
    checkCuda(cudaEventElapsedTime(&milliseconds, start, stop), "cudaEventElapsedTime");
    checkCuda(cudaMemcpy(h_P, d_P, signal_bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(h_P)");
    printf("Global memory result:\n");
    for(int i=0; i<INPUT_SIZE; i++){
        printf("%f ", h_P[i]);
    }
    printf("\n");
    printf("Time elapsed: %f ms\n", milliseconds);

    checkCuda(cudaFree(d_N), "cudaFree(d_N)");
    checkCuda(cudaFree(d_P), "cudaFree(d_P)");
    checkCuda(cudaFree(d_M), "cudaFree(d_M)");
    checkCuda(cudaEventDestroy(start), "cudaEventDestroy(start)");
    checkCuda(cudaEventDestroy(stop), "cudaEventDestroy(stop)");
    return 0;
}

Input array N:
10.254496 14.837104 13.811041 8.563289 14.483236 8.257147 10.936128 7.317841 5.508823 3.183267 5.942060 6.358651 

Input mask M:
1.672705 2.313920 4.573117 1.989701 2.665976 

Global memory result:
113.236313 141.889267 170.294479 146.767319 154.734802 126.866623 122.591461 92.029861 82.593452 68.319954 56.406010 48.152973 
Time elapsed: 0.023040 ms



In [None]:
%%cu
#include<stdio.h>
#include<stdlib.h>
#include<time.h>
// N: dimension of the square input matrix and length of the input/output vectors.
#define N 4
/*
 * Sparse matrix-vector product y = A * x with A stored in CSR form.
 *
 * One thread handles one matrix row; the row index is the flat global thread
 * index of a 1D launch, and threads with row >= num_rows do nothing.
 *
 *   num_rows  - number of rows of A (and elements of y)
 *   data      - non-zero values of A, row by row
 *   col_index - column of each entry in data
 *   row_ptr   - num_rows+1 offsets; row r occupies data[row_ptr[r] .. row_ptr[r+1])
 *   x         - input vector, indexed by the values in col_index
 *   y         - output vector (num_rows elements)
 *
 * The sum is accumulated in an int, matching the element type; a float
 * accumulator would round once the partial sums exceed 2^24.
 */
__global__ void SpMV_CSRKernel(int num_rows, int *data, int *col_index, int *row_ptr, int *x, int *y){
    int row = blockIdx.x*blockDim.x + threadIdx.x;
    if(row<num_rows){
        int dot = 0;
        int row_start = row_ptr[row];
        int row_end = row_ptr[row+1];
        for(int elem = row_start; elem<row_end; elem++){
            dot += data[elem] * x[col_index[elem]];
        }
        y[row]=dot;
    }
}
/* Print a readable message and abort when a CUDA runtime call fails. */
static void checkCuda(cudaError_t status, const char *what){
    if(status != cudaSuccess){
        fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/*
 * Host driver: generates a random N x N integer matrix and a vector, converts
 * the matrix to CSR form (data / col_index / row_ptr), multiplies it with the
 * vector on the GPU and prints every intermediate array and the result.
 */
int main(){
    srand((unsigned int)time(NULL));
    int y[N], row_ptr[N+1];
    int inputMatrix[N][N], x[N];
    // A matrix with N*N cells has at most N*N non-zeros, so the CSR arrays get
    // that fixed capacity instead of a runtime-sized (possibly zero-length) array.
    int data[N*N], col_index[N*N];
    int non_zero_count=0;
    printf("Input Matrix: \n");
    for(int i=0; i<N; i++){
        for(int j=0; j<N; j++){
            inputMatrix[i][j]=(rand()%20);
            printf("%d ", inputMatrix[i][j]);
        }
        printf("\n");
    }
    printf("\n\n");
    printf("Input Vector: \n");
    for(int i=0; i<N; i++){
        x[i] = (rand()%15);
        printf("%d ", x[i]);
    }
    printf("\n\n");
    // Single pass over the matrix: record where each row starts and append its
    // non-zero values together with their column indices.
    for(int i=0; i<N; i++){
        row_ptr[i]=non_zero_count;
        for(int j=0; j<N; j++){
            if(inputMatrix[i][j]!=0){
                data[non_zero_count]=inputMatrix[i][j];
                col_index[non_zero_count]=j;
                non_zero_count++;
            }
        }
    }
    row_ptr[N]=non_zero_count;
    printf("\nData Array: \n");
    for(int i=0; i<non_zero_count; i++){
        printf("%d ", data[i]);
    }
    printf("\ncol_index Array: \n");
    for(int i=0; i<non_zero_count; i++){
        printf("%d ", col_index[i]);
    }
    printf("\nrow_ptr Array: \n");
    for(int i=0; i<=N; i++){
        printf("%d ", row_ptr[i]);
    }
    int *d_data, *d_col_index, *d_row_ptr, *d_x, *d_y;
    const size_t nnz_bytes = non_zero_count*sizeof(int);
    //allocate device memory (CSR buffers sized for the worst case so the size is never 0)
    checkCuda(cudaMalloc((void**)&d_data, N*N*sizeof(int)), "cudaMalloc(d_data)");
    checkCuda(cudaMalloc((void**)&d_col_index, N*N*sizeof(int)), "cudaMalloc(d_col_index)");
    checkCuda(cudaMalloc((void**)&d_row_ptr, (N+1)*sizeof(int)), "cudaMalloc(d_row_ptr)");
    checkCuda(cudaMalloc((void**)&d_x, N*sizeof(int)), "cudaMalloc(d_x)");
    checkCuda(cudaMalloc((void**)&d_y, N*sizeof(int)), "cudaMalloc(d_y)");
    //copy from host to device
    checkCuda(cudaMemcpy(d_data, data, nnz_bytes, cudaMemcpyHostToDevice), "cudaMemcpy(d_data)");
    checkCuda(cudaMemcpy(d_col_index, col_index, nnz_bytes, cudaMemcpyHostToDevice), "cudaMemcpy(d_col_index)");
    checkCuda(cudaMemcpy(d_row_ptr, row_ptr, (N+1)*sizeof(int), cudaMemcpyHostToDevice), "cudaMemcpy(d_row_ptr)");
    checkCuda(cudaMemcpy(d_x, x, N*sizeof(int), cudaMemcpyHostToDevice), "cudaMemcpy(d_x)");
    //run kernel: one block of N threads, one thread per row
    SpMV_CSRKernel<<<1, N>>>(N, d_data, d_col_index, d_row_ptr, d_x, d_y);
    checkCuda(cudaGetLastError(), "SpMV_CSRKernel launch");
    // The blocking copy also reports any fault raised while the kernel ran.
    checkCuda(cudaMemcpy(y, d_y, N*sizeof(int), cudaMemcpyDeviceToHost), "cudaMemcpy(y)");

    printf("\nResultant Vector: \n");
    for(int i=0; i<N; i++){
        printf("%d ", y[i]);
    }
    checkCuda(cudaFree(d_data), "cudaFree(d_data)");
    checkCuda(cudaFree(d_col_index), "cudaFree(d_col_index)");
    checkCuda(cudaFree(d_row_ptr), "cudaFree(d_row_ptr)");
    checkCuda(cudaFree(d_x), "cudaFree(d_x)");
    checkCuda(cudaFree(d_y), "cudaFree(d_y)");
    return 0;
}

Input Matrix: 
9 10 1 14 
3 3 18 7 
10 4 9 10 
16 16 13 11 


Input Vector: 
7 3 1 5 


Data Array: 
9 10 1 14 3 3 18 7 10 4 9 10 16 16 13 11 
col_index Array: 
0 1 2 3 0 1 2 3 0 1 2 3 0 1 2 3 
row_ptr Array: 
0 4 8 12 16 
Resultant Vector: 
164 83 141 228 


In [None]:
%%cu
#include<stdio.h>
#include<stdlib.h>
/*
 * Dense integer matrix product c = a * b for row-major matrices.
 *
 * One thread computes one element of c. Rows are taken from the y dimension
 * of the launch and columns from the x dimension, so the grid must cover at
 * least o threads in x and m threads in y; threads outside the m x o output
 * do nothing.
 *
 *   a - device pointer to the m x n left operand
 *   b - device pointer to the n x o right operand
 *   c - device pointer to the m x o result
 *   m, n, o - matrix dimensions as described above
 */
__global__ void matMulKernel(const int *a, const int *b, int *c, int m, int n, int o){
    int row = blockIdx.y*blockDim.y+threadIdx.y;
    int col = blockIdx.x*blockDim.x+threadIdx.x;
    // Without this guard a surplus thread would alias another cell of c
    // (col >= o) or write past the end of the buffer (row >= m).
    if(row >= m || col >= o){
        return;
    }
    // Accumulate in a register and store once instead of updating global
    // memory on every iteration.
    int sum = 0;
    for(int k=0; k<n; k++){
        sum += a[row*n+k]*b[k*o+col];
    }
    c[row*o+col] = sum;
}
/* Print a readable message and abort when a CUDA runtime call fails. */
static void checkCuda(cudaError_t status, const char *what){
    if(status != cudaSuccess){
        fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/*
 * Host driver: fills an m x n matrix A and an n x o matrix B with random
 * integers, multiplies them on the GPU and prints A, B and the product.
 */
int main(){
    // const dimensions make A, B and C ordinary fixed-size arrays rather than
    // variable-length arrays.
    const int m=4, n=2, o=4;
    int A[m][n];
    int B[n][o];
    int C[m][o];
    printf("Matrix A: \n");
    for(int i=0;i<m;i++){
        for(int j=0;j<n;j++){
            A[i][j]=rand()%20;
            printf("%d ", A[i][j]);
        }
        printf("\n");
    }
    printf("\n\n");
    printf("Matrix B: \n");
    for(int i=0;i<n;i++){
        for(int j=0;j<o;j++){
            B[i][j]=rand()%20;
            printf("%d ", B[i][j]);
        }
        printf("\n");
    }
    printf("\n\n");
    //memory allocation
    int *d_A, *d_B, *d_C;
    checkCuda(cudaMalloc((void**)&d_A,m*n*sizeof(int)), "cudaMalloc(d_A)");
    checkCuda(cudaMalloc((void**)&d_B,n*o*sizeof(int)), "cudaMalloc(d_B)");
    checkCuda(cudaMalloc((void**)&d_C,m*o*sizeof(int)), "cudaMalloc(d_C)");

    checkCuda(cudaMemcpy(d_A,A,m*n*sizeof(int),cudaMemcpyHostToDevice), "cudaMemcpy(d_A)");
    checkCuda(cudaMemcpy(d_B,B,n*o*sizeof(int),cudaMemcpyHostToDevice), "cudaMemcpy(d_B)");

    // The kernel maps x to columns and y to rows, so the grid is sized by
    // rounding o (columns) and m (rows) up to whole blocks.
    const int thread=2;
    dim3 threads(thread, thread);
    dim3 blocks((o+thread-1)/thread, (m+thread-1)/thread);
    matMulKernel<<<blocks, threads>>>(d_A, d_B, d_C, m,n,o);
    checkCuda(cudaGetLastError(), "matMulKernel launch");
    {
    cudaError_t cudaerr = cudaDeviceSynchronize();
    if (cudaerr != cudaSuccess){
        printf("kernel launch failed with error \"%s\".\n",
               cudaGetErrorString(cudaerr));
        // The result buffer is meaningless after a failed kernel, so stop here.
        return EXIT_FAILURE;
    }
    }
    checkCuda(cudaMemcpy(C,d_C,m*o*sizeof(int),cudaMemcpyDeviceToHost), "cudaMemcpy(C)");
    printf("Resultant Matrix: \n");
    for(int i=0;i<m; i++){
        for(int j=0;j<o; j++){
            printf("%d ", C[i][j]);
        }
        printf("\n");
    }
    checkCuda(cudaFree(d_A), "cudaFree(d_A)");
    checkCuda(cudaFree(d_B), "cudaFree(d_B)");
    checkCuda(cudaFree(d_C), "cudaFree(d_C)");
    return 0;
}

Matrix A: 
3 6 
17 15 
13 15 
6 12 


Matrix B: 
9 1 2 7 
10 19 3 6 


Resultant Matrix: 
87 117 24 57 
303 302 79 209 
267 298 71 181 
174 234 48 114 

