<a href="https://colab.research.google.com/github/PrasannaPaithankar/KOSS-CUDA-Task/blob/main/KOSS_Selection_Task.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Run the first three cells to load the CUDA runtime plugin.
#### Note: In the menu bar, open **Runtime** -> **Change runtime type** and check that the selected hardware accelerator is GPU.



In [None]:
!nvcc --version #to check nvcc compiler version

In [None]:
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git


In [None]:
%load_ext nvcc_plugin

Enter your CUDA code in the cells below.
#### Note: This notebook runs cells as Python by default. To compile and run a cell as CUDA code with the nvcc compiler, add `%%cu` at the beginning of the cell to tell the environment that it contains CUDA code. A demo code cell has been added below for reference.


# **Part 1**
Print "Hello World on GPU" 1024 times using a CUDA kernel launched with 4 blocks of 256 threads each, and compare the time with sequential C code that prints "Hello World on CPU" 1024 times. Compare the performance with respect to time, using the C `clock` function from the `time` library for the CPU measurement.

In [None]:
%%cu
//Enter your Cuda Code below this line
#include <stdio.h>        
#include <unistd.h>  
#include <time.h>

// Kernel: each launched thread emits one "Hello World on GPU" line through
// device-side printf. It takes no arguments and does not read its thread or
// block index, so the number of lines printed equals the number of threads
// in the launch.
__global__ void helloWorld(void)
{
    printf("Hello World on GPU\n");
}

// Evaluates a CUDA runtime call; on failure prints the error string with the
// source location to stderr and makes the enclosing function return 1.
// Resources are not released on that path: it is only used from main(), where
// the process ends immediately afterwards.
#define CUDA_CHECK(call)                                                      \
    do {                                                                      \
        cudaError_t err_ = (call);                                            \
        if (err_ != cudaSuccess) {                                            \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,     \
                    cudaGetErrorString(err_));                                \
            return 1;                                                         \
        }                                                                     \
    } while (0)

// Prints the greeting 1024 times on the CPU and 1024 times on the GPU
// (4 blocks x 256 threads), then reports both durations in milliseconds.
// Returns 0 on success and 1 if any CUDA call fails.
int main()
{
   const int kBlocks = 4;
   const int kThreadsPerBlock = 256;
   // Keep the CPU loop count tied to the launch shape so both sides always
   // print the same number of lines.
   const int kTotalPrints = kBlocks * kThreadsPerBlock;

   float CPU_time = 0.0f, GPU_time = 0.0f;
   cudaEvent_t start, stop;

   // ---- CPU: sequential prints timed with clock() ----
   clock_t start_t = clock();
   for (int i = 0; i < kTotalPrints; i++)
   {
       printf("Hello World on CPU\n");
   }
   clock_t end_t = clock();
   // clock() ticks -> milliseconds, the same unit cudaEventElapsedTime uses.
   CPU_time = 1000.0f * (float)(end_t - start_t) / CLOCKS_PER_SEC;

   // ---- GPU ----
   // Synchronize once before timing so that any one-time runtime start-up
   // work is not attributed to the kernel.
   CUDA_CHECK(cudaDeviceSynchronize());
   CUDA_CHECK(cudaEventCreate(&start));
   CUDA_CHECK(cudaEventCreate(&stop));

   // The events must bracket the launch: start before, stop after.
   CUDA_CHECK(cudaEventRecord(start, 0));
   helloWorld<<< kBlocks, kThreadsPerBlock >>>( );
   CUDA_CHECK(cudaEventRecord(stop, 0));
   // Launch-configuration errors are only visible through cudaGetLastError().
   CUDA_CHECK(cudaGetLastError());

   // Block until the stop event (and therefore the kernel) has completed.
   CUDA_CHECK(cudaEventSynchronize(stop));
   CUDA_CHECK(cudaEventElapsedTime(&GPU_time, start, stop));
   CUDA_CHECK(cudaEventDestroy(start));
   CUDA_CHECK(cudaEventDestroy(stop));

   printf("CPU execution time = %f ms\nGPU execution time = %f ms\n",
           CPU_time, GPU_time);

   return 0;
}

# **Part 2**
Add two 2-dimensional matrices A and B to produce a new matrix C. The size of each matrix is 32x32. The elements of the matrices are A[i][j]=i and B[i][j]=j. Compute the sum with a GPU kernel launched as 4 blocks of 16*16 threads each, copy the result from the GPU to the CPU using cudaMemcpy, and print the result on the terminal. Compare the performance with respect to time again.

In [None]:
%%cu
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define N 32

// Evaluates a CUDA runtime call; on failure prints the error string with the
// source location to stderr and makes the enclosing function return 1.
// Resources are not released on that path: it is only used from main(), where
// the process ends immediately afterwards.
#define CUDA_CHECK(call)                                                      \
    do {                                                                      \
        cudaError_t err_ = (call);                                            \
        if (err_ != cudaSuccess) {                                            \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,     \
                    cudaGetErrorString(err_));                                \
            return 1;                                                         \
        }                                                                     \
    } while (0)

// Edge length of one square thread block (16 x 16 = 256 threads).
static const int kTile = 16;

/*
 * Element-wise matrix addition: C[row][col] = A[row][col] + B[row][col].
 *
 * Launch layout: a 2D grid of 2D blocks whose combined extent covers at least
 * N columns in x and N rows in y. Each thread handles one element; threads
 * that fall outside the N x N matrix do nothing. x is mapped to the column
 * (the fastest-varying index of the row-major arrays) so that threads with
 * consecutive threadIdx.x touch consecutive addresses.
 *
 * A, B and C must point to N x N int arrays in device memory.
 * No shared memory is used, so no block-level barrier is needed.
 */
__global__ void Add(int A[N][N], int B[N][N], int C[N][N]){
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    if (row < N && col < N)
    {
        C[row][col] = A[row][col] + B[row][col];
    }
}

// Prints `title` followed by the N x N matrix M, one row per line, and a
// trailing blank line.
static void printMatrix(const char *title, int M[N][N])
{
    printf("%s", title);
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            printf("%d ", M[i][j]);
        }
        printf("\n");
    }
    printf("\n");
}

// Builds A[i][j] = i and B[i][j] = j, adds them on the CPU and on the GPU
// (4 blocks of 16 x 16 threads), prints all matrices, checks the GPU result
// against the CPU result and reports both durations in milliseconds.
// Returns 0 on success, 1 on a CUDA error or a result mismatch.
int main(){

    int A[N][N];
    int B[N][N];
    int C[N][N];       // CPU reference result
    int gpuC[N][N];    // result copied back from the device

    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            A[i][j] = i;
            B[i][j] = j;
            gpuC[i][j] = 0;
        }
    }

    printMatrix("A = \n", A);
    printMatrix("B = \n", B);

    // ---- CPU ----
    clock_t start_t = clock();
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            C[i][j] = A[i][j] + B[i][j];
        }
    }
    clock_t end_t = clock();
    // clock() ticks -> milliseconds, the same unit cudaEventElapsedTime uses.
    float CPU_time = 1000.0f * (float)(end_t - start_t) / CLOCKS_PER_SEC;

    printMatrix("CPU: C = \n", C);

    // ---- GPU ----
    const size_t bytes = sizeof(int) * N * N;
    int (*pA)[N] = NULL, (*pB)[N] = NULL, (*pC)[N] = NULL;

    CUDA_CHECK(cudaMalloc((void**)&pA, bytes));
    CUDA_CHECK(cudaMalloc((void**)&pB, bytes));
    CUDA_CHECK(cudaMalloc((void**)&pC, bytes));

    CUDA_CHECK(cudaMemcpy(pA, A, bytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(pB, B, bytes, cudaMemcpyHostToDevice));
    // The device output starts zeroed and is never seeded with the CPU
    // result, so a correct read-back can only come from the kernel.
    CUDA_CHECK(cudaMemset(pC, 0, bytes));

    cudaEvent_t start, stop;
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));

    // For N = 32 this is a 2 x 2 grid, i.e. 4 blocks of 16 x 16 threads.
    dim3 threadsPerBlock(kTile, kTile);
    dim3 numBlocks((N + kTile - 1) / kTile, (N + kTile - 1) / kTile);

    // The events must bracket the launch: start before, stop after.
    CUDA_CHECK(cudaEventRecord(start, 0));
    Add<<<numBlocks, threadsPerBlock>>>(pA, pB, pC);
    CUDA_CHECK(cudaEventRecord(stop, 0));
    // Launch-configuration errors are only visible through cudaGetLastError().
    CUDA_CHECK(cudaGetLastError());

    CUDA_CHECK(cudaEventSynchronize(stop));
    float GPU_time = 0.0f;
    CUDA_CHECK(cudaEventElapsedTime(&GPU_time, start, stop));
    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));

    CUDA_CHECK(cudaMemcpy(gpuC, pC, bytes, cudaMemcpyDeviceToHost));

    printMatrix("GPU: C = \n", gpuC);

    // Compare the device result with the CPU reference element by element.
    int mismatches = 0;
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            if (gpuC[i][j] != C[i][j]) {
                mismatches++;
            }
        }
    }

    printf("CPU execution time = %f ms\nGPU execution time = %f ms\n",
           CPU_time, GPU_time);
    if (mismatches == 0) {
        printf("Verification: PASSED\n");
    } else {
        printf("Verification: FAILED (%d mismatching elements)\n", mismatches);
    }

    // Release device buffers.
    CUDA_CHECK(cudaFree(pA));
    CUDA_CHECK(cudaFree(pB));
    CUDA_CHECK(cudaFree(pC));

    return mismatches == 0 ? 0 : 1;
}