<a href="https://colab.research.google.com/github/PrachiTekawade/CUDA_SpMV/blob/main/cuda_notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Remove any CUDA / NVIDIA packages already present so a specific toolkit
# version can be installed from scratch by the next cell.
# NOTE(review): none of these apt/dpkg commands pass -y; confirm they do not
# stop at a confirmation prompt in a non-interactive notebook.
!apt-get --purge remove cuda nvidia* libnvidia-*
# Purge every remaining package whose dpkg listing line contains "cuda-".
!dpkg -l | grep cuda- | awk '{print $2}' | xargs -n1 dpkg --purge
!apt-get remove cuda-*
# Drop dependencies that are no longer needed, then refresh the package index.
!apt autoremove
!apt-get update

In [None]:
# Download the CUDA 9.2 local-repository package (Ubuntu 16.04 build), register
# it and its signing key with apt, then install the cuda-9.2 meta-package.
# NOTE(review): the device reported later in this notebook is a Tesla T4
# (compute capability 7.5); confirm that CUDA 9.2 can target it.
!wget https://developer.nvidia.com/compute/cuda/9.2/Prod/local_installers/cuda-repo-ubuntu1604-9-2-local_9.2.88-1_amd64 -O cuda-repo-ubuntu1604-9-2-local_9.2.88-1_amd64.deb
!dpkg -i cuda-repo-ubuntu1604-9-2-local_9.2.88-1_amd64.deb
!apt-key add /var/cuda-repo-9-2-local/7fa2af80.pub
!apt-get update
!apt-get install cuda-9.2

In [None]:
# Print the version of the nvcc compiler found on PATH to confirm the install.
!nvcc --version

In [None]:
# Install the nvcc4jupyter plugin straight from its GitHub repository.
# Uses https:// because GitHub no longer serves the unauthenticated git://
# protocol, which made the original git+git:// URL fail to clone.
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git

In [None]:
# Load the nvcc_plugin IPython extension installed above
# (presumably what provides the %%cu cell magic used by the cells below).
%load_ext nvcc_plugin

In [None]:
%%cu
#include<stdio.h>
#include<cuda.h>


/*
 * Queries the currently selected CUDA device and prints its name and
 * compute capability.
 *
 * Returns 0 on success, or 1 if a CUDA runtime call fails (in which case the
 * error string is written to stderr instead of printing uninitialised
 * device properties).
 */
int main()
{
    int device_id = 0;
    cudaError_t status = cudaGetDevice(&device_id);
    if (status != cudaSuccess)
    {
        fprintf(stderr, "cudaGetDevice failed: %s\n", cudaGetErrorString(status));
        return 1;
    }

    cudaDeviceProp p;
    status = cudaGetDeviceProperties(&p, device_id);
    if (status != cudaSuccess)
    {
        fprintf(stderr, "cudaGetDeviceProperties failed: %s\n", cudaGetErrorString(status));
        return 1;
    }

    // Compute capability is reported as <major>.<minor>.
    int major = p.major;
    int minor = p.minor;

    printf("Name of GPU on your system is %s\n",p.name);

    printf("\n Compute Capability of a current GPU on your system is %d.%d",major,minor);

    return 0;
}


Name of GPU on your system is Tesla T4

 Compute Capability of a current GPU on your system is 7.5


In [None]:
%%cu

#include <cuda_runtime.h>
#include <device_launch_parameters.h>

#include <assert.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

// Capacity, in int ELEMENTS (not bytes), of each statically sized shared
// memory tile: 16 * 16 * 4 = 1024 ints, i.e. room for a tile of up to
// 32 x 32 elements. Parenthesised so the macro expands safely inside any
// larger expression.
#define SHMEM_SIZE (16 * 16 * 4)

/*
 * Tiled integer matrix multiplication: c = a * b for row-major n x n matrices.
 *
 * Launch requirements:
 *   - 2-D thread block of exactly tile_size x tile_size threads
 *   - 2-D grid of at least ceil(n / tile_size) blocks in x and in y
 *   - tile_size * tile_size <= SHMEM_SIZE (each tile lives in a static
 *     shared array of SHMEM_SIZE ints)
 *
 * n does not have to be a multiple of tile_size: loads outside the matrix
 * are replaced by 0 (which contributes nothing to the dot product) and
 * threads that map outside the matrix do not store a result.
 *
 * Parameters:
 *   a, b       device pointers to the n x n input matrices
 *   c          device pointer to the n x n output matrix
 *   n          matrix dimension
 *   tile_size  edge length of the square tile / thread block
 */
__global__ void tiledMatrixMul(int *a, int *b, int *c, int n, int tile_size) {
	// Preconditions; a violation would index past the shared arrays.
	assert((int)blockDim.x == tile_size && (int)blockDim.y == tile_size);
	assert(tile_size * tile_size <= SHMEM_SIZE);

	// Two statically-sized pieces of shared memory
	__shared__ int A[SHMEM_SIZE];
	__shared__ int B[SHMEM_SIZE];

	// Shorten these parameters for clean re-use
	int tx = threadIdx.x;
	int ty = threadIdx.y;
	int bx = blockIdx.x;
	int by = blockIdx.y;

	// Calculate global row and column positions for this thread
	int row = by * tile_size + ty;
	int col = bx * tile_size + tx;

	// Intermediate sum for element being written
	int temp_val = 0;

	// Round up so a partial tile at the right/bottom edge is still processed
	int num_tiles = (n + tile_size - 1) / tile_size;

	// Sweep tiles over entire matrix
	for (int i = 0; i < num_tiles; i++) {
		/*
			Every thread in a threadblock loads one element of a and one
			element of b into shared memory, at the slot matching the
			thread's position in the block (ty * tile_size + tx).

			For a: the thread's global row, and column i*tile_size + tx.
			For b: row i*tile_size + ty, and the thread's global column.
		*/
		int a_col = i * tile_size + tx;
		int b_row = i * tile_size + ty;

		// Out-of-range elements are loaded as 0 so edge tiles stay correct
		A[(ty * tile_size) + tx] =
			(row < n && a_col < n) ? a[(size_t)row * n + a_col] : 0;
		B[(ty * tile_size) + tx] =
			(b_row < n && col < n) ? b[(size_t)b_row * n + col] : 0;

		// Ensure all threads have loaded their data before proceeding.
		// Kept outside any per-thread condition so every thread reaches it.
		__syncthreads();

		// Calculate all temp values for this tile
		for (int j = 0; j < tile_size; j++) {
			temp_val += A[(ty * tile_size) + j] * B[(j * tile_size) + tx];
		}

		// Ensure some threads don't progress and stomp current shared memory values
		__syncthreads();
	}

	// Only threads that map onto a real matrix element write a result
	if (row < n && col < n) {
		c[(size_t)row * n + col] = temp_val;
	}
}

/*
 * Host reference implementation: multiplies two row-major m x m integer
 * matrices and stores the product in h_result.
 *
 *   h_a, h_b  : input matrices (m * m ints each)
 *   h_result  : output matrix (m * m ints), fully overwritten
 *   m         : matrix dimension
 */
__host__ void cpu_mvm(int* h_a, int* h_b, int* h_result, int m) {
    for (int row = 0; row < m; ++row) {
        // Start of the current row in the left operand and in the output
        const int* lhs_row = h_a + row * m;
        int* out_row = h_result + row * m;

        for (int col = 0; col < m; ++col) {
            // Dot product of row `row` of h_a with column `col` of h_b
            int acc = 0;
            for (int k = 0; k < m; ++k) {
                acc += lhs_row[k] * h_b[k * m + col];
            }
            out_row[col] = acc;
        }
    }
}
/*
 * Recomputes the product of the row-major n x n matrices a and b on the host
 * and asserts that every element equals the corresponding element of c.
 * Triggers an assertion failure on the first mismatch.
 */
void check_answer(int *a, int *b, int *c, int n) {
	for (int i = 0; i < n; i++) {
		// Row i of the left operand
		const int *lhs_row = a + i * n;

		for (int j = 0; j < n; j++) {
			// Expected value of element (i, j)
			int tmp = 0;
			int k = 0;
			while (k < n) {
				tmp += lhs_row[k] * b[k * n + j];
				++k;
			}
            assert(tmp == c[i * n + j]);
		}
	}
}

/*
 * Fills the n x n matrix a, in row-major order, with values produced by
 * rand() % 10 (one rand() call per element).
 */
void init_matrix(int *a, int n) {
	int *cursor = a;  // next element to fill, advancing in row-major order
	for (int row = 0; row < n; ++row) {
		for (int col = 0; col < n; ++col) {
			*cursor++ = rand() % 10;
		}
	}
}

// Terminates the program with a diagnostic if a CUDA runtime call failed.
static void check_cuda(cudaError_t status, const char *what) {
	if (status != cudaSuccess) {
		fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
		exit(EXIT_FAILURE);
	}
}

// Returns the index of the first differing element, or count if none differ.
static size_t first_mismatch(const int *x, const int *y, size_t count) {
	for (size_t idx = 0; idx < count; ++idx) {
		if (x[idx] != y[idx]) {
			return idx;
		}
	}
	return count;
}

/*
 * Benchmarks tiledMatrixMul against the host implementation cpu_mvm on a
 * 1024 x 1024 integer matrix product, prints both runtimes in milliseconds,
 * and verifies that the two results are identical.
 *
 * Returns 0 when the GPU result matches the CPU result, 1 on a host
 * allocation failure or a result mismatch. Any failing CUDA call terminates
 * the program via check_cuda().
 */
int main() {
	// Problem size = 1024 x 1024 matrix
	int n = 1024;

	// Element count and matrix size in bytes, computed in size_t so the
	// product cannot overflow int for larger n
	size_t count = (size_t)n * n;
	size_t bytes = count * sizeof(int);

	// Host matrices: inputs, GPU result, and a separate CPU reference so the
	// GPU result is not overwritten before it can be checked
	int *h_a = (int*)malloc(bytes);
	int *h_b = (int*)malloc(bytes);
	int *h_c = (int*)malloc(bytes);
	int *h_ref = (int*)malloc(bytes);
	if (h_a == NULL || h_b == NULL || h_c == NULL || h_ref == NULL) {
		fprintf(stderr, "host allocation of %zu bytes failed\n", bytes);
		free(h_a);
		free(h_b);
		free(h_c);
		free(h_ref);
		return 1;
	}

	// Device matrix pointers
	int *d_a = NULL, *d_b = NULL, *d_c = NULL;
	check_cuda(cudaMalloc(&d_a, bytes), "cudaMalloc(d_a)");
	check_cuda(cudaMalloc(&d_b, bytes), "cudaMalloc(d_b)");
	check_cuda(cudaMalloc(&d_c, bytes), "cudaMalloc(d_c)");

	// Initialize matrices
	init_matrix(h_a, n);
	init_matrix(h_b, n);

	// Copy matrices to the device
	check_cuda(cudaMemcpy(d_a, h_a, bytes, cudaMemcpyHostToDevice), "copy of a to device");
	check_cuda(cudaMemcpy(d_b, h_b, bytes, cudaMemcpyHostToDevice), "copy of b to device");

	// Threads per block (in both x and y dimensions). 16 x 16 = 256 threads,
	// a whole number of warps; it divides n and fits the kernel's shared tiles.
	int BLOCK_SIZE = 16;

	// Blocks in each dimension
	int GRID_SIZE = (n + BLOCK_SIZE - 1) / BLOCK_SIZE;

	// Use dim3 objects for 2-D grids and threadblocks
	dim3 grid(GRID_SIZE, GRID_SIZE);
	dim3 threads(BLOCK_SIZE, BLOCK_SIZE);

	cudaEvent_t start, stop;
	check_cuda(cudaEventCreate(&start), "cudaEventCreate(start)");
	check_cuda(cudaEventCreate(&stop), "cudaEventCreate(stop)");

	check_cuda(cudaEventRecord(start), "cudaEventRecord(start)");

	// Launch kernel
	tiledMatrixMul <<<grid, threads>>> (d_a, d_b, d_c, n, BLOCK_SIZE);
	// A launch does not return an error itself; a bad configuration or a
	// missing kernel image is only visible through cudaGetLastError().
	check_cuda(cudaGetLastError(), "tiledMatrixMul launch");

	check_cuda(cudaEventRecord(stop), "cudaEventRecord(stop)");
	// Errors raised while the kernel executes surface at this synchronization
	check_cuda(cudaEventSynchronize(stop), "tiledMatrixMul execution");

	// cudaEventElapsedTime reports milliseconds; print it unscaled so it is
	// directly comparable with the CPU figure below.
	float milliseconds = 0;
	check_cuda(cudaEventElapsedTime(&milliseconds, start, stop), "cudaEventElapsedTime");
	printf("runtime : %.4f ms\n", milliseconds);

	// Copy result back from device
	check_cuda(cudaMemcpy(h_c, d_c, bytes, cudaMemcpyDeviceToHost), "copy of c to host");

	// Time the same multiplication on the CPU, writing to the reference buffer
	clock_t begin = clock();

	cpu_mvm(h_a, h_b, h_ref, n); //matrix multiplication on cpu

	clock_t end = clock();
	double time_spent = (double)1000*(end - begin) / CLOCKS_PER_SEC;
	printf(" CPU runtime : %lf ms\n", time_spent);

	// Verify the GPU result against the CPU reference
	size_t bad = first_mismatch(h_c, h_ref, count);
	int status = 0;
	if (bad != count) {
		fprintf(stderr, "MISMATCH at element %zu: GPU %d, CPU %d\n", bad, h_c[bad], h_ref[bad]);
		status = 1;
	}

	// Release timing events
	cudaEventDestroy(start);
	cudaEventDestroy(stop);

	// Free host memory
	free(h_a);
	free(h_b);
	free(h_c);
	free(h_ref);

	// Free device memory
	cudaFree(d_a);
	cudaFree(d_b);
	cudaFree(d_c);

	if (status == 0) {
		printf("COMPLETED SUCCESSFULLY\n");
	}

	return status;
}



runtime : 44.0739
 CPU runtime : 7021.771000
COMPLETED SUCCESSFULLY

