<a href="https://colab.research.google.com/github/mmmovania/CUDA_Spring2023/blob/main/Week7/DotProductGPU_UnifiedMemory.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git
%load_ext nvcc_plugin

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/andreinechaev/nvcc4jupyter.git
  Cloning https://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-w_t7g2j0
  Running command git clone --filter=blob:none --quiet https://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-w_t7g2j0
  Resolved https://github.com/andreinechaev/nvcc4jupyter.git to commit aac710a35f52bb78ab34d2e52517237941399eff
  Preparing metadata (setup.py) ... done
Building wheels for collected packages: NVCCPlugin
  Building wheel for NVCCPlugin (setup.py) ... done
  Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-py3-none-any.whl size=4304 sha256=46ac84c3b92eff0b41b60e8e1f272247df34e522f8ffa722b1f2213374aa0e73
  Stored in directory: /tmp/pip-ephem-wheel-cache-z19rg1dp/wheels/f3/08/cc/e2b5b0e1c92df07dbb50a6f024a68ce090f5e7b2316b41756d
Successfully built NVCCPlugin
Installing collecte

In [2]:
%%cu
#include <stdio.h>
// Number of elements in each input vector.
const int N = 33 * 1024;
// Threads launched per block. The dot kernel also uses this value to size its
// shared-memory cache, and its reduction requires it to be a power of 2.
const int threadsPerBlock = 256;

// Minimum of two values. Both arguments and the whole expansion are
// parenthesized so that operands containing lower-precedence operators
// (e.g. `x | 1`, `c ? p : q`) expand correctly. Each argument may be
// evaluated twice, so do not pass expressions with side effects.
#define imin(a,b) ((a) < (b) ? (a) : (b))

// Blocks launched per grid: enough blocks to cover N elements, capped at 32.
const int blocksPerGrid =  imin( 32, (N+threadsPerBlock-1) / threadsPerBlock );

/*
 * Reports a failed CUDA runtime call on stderr.
 *
 * err: status code returned by a CUDA runtime call.
 * msg: label identifying the call site, included in the diagnostic.
 *
 * Returns err unchanged, so the caller can still branch on it. Nothing is
 * printed when err is cudaSuccess.
 */
inline cudaError_t checkCudaErr(cudaError_t err, const char* msg) {
	const bool failed = (cudaSuccess != err);
	if (failed)
		fprintf(stderr, "CUDA Runtime error at %s: %s\n", msg, cudaGetErrorString(err));
	return err;
}

/*
 * Computes one partial dot product of a and b per block.
 *
 * a, b: input vectors; elements [0, N) are read.
 * c:    output; c[blockIdx.x] receives the sum accumulated by that block,
 *       so it needs one element per launched block.
 *
 * Launch requirements: 1D grid and 1D block; blockDim.x must be a power of 2
 * and no larger than threadsPerBlock (the size of the shared cache).
 */
__global__ void dot( float *a, float *b, float *c ) {
	__shared__ float cache[threadsPerBlock];
	const int lane = threadIdx.x;
	const int stride = blockDim.x * gridDim.x;

	// Grid-stride accumulation: each thread sums the products of every
	// stride-th element, starting at its global index.
	float acc = 0;
	for (int idx = lane + blockIdx.x * blockDim.x; idx < N; idx += stride)
		acc += a[idx] * b[idx];

	// Publish the per-thread sum, then wait until the whole block has written.
	cache[lane] = acc;
	__syncthreads();

	// Tree reduction in shared memory: each pass folds the upper half of the
	// active range onto the lower half. The barrier sits outside the `if`
	// so every thread in the block reaches it on every pass.
	for (int half = blockDim.x / 2; half != 0; half /= 2) {
		if (lane < half)
			cache[lane] += cache[lane + half];
		__syncthreads();
	}

	// Thread 0 writes the block's total.
	if (lane == 0)
		c[blockIdx.x] = cache[0];
}


/*
 * Host driver: fills two N-element vectors in unified (managed) memory,
 * launches the dot kernel to produce one partial sum per block, adds the
 * partial sums on the CPU, and prints the result next to the closed-form
 * expected value.
 *
 * Returns 0 on success and 1 if any CUDA call fails. On failure the
 * computation is skipped, but whatever was allocated is still released.
 */
int main() {
	// Start from NULL so the cleanup below is safe even when an allocation
	// fails part-way (cudaFree on a null pointer performs no operation).
	float   *a = NULL, *b = NULL, *partial_c = NULL, c = 0;
	bool ok = true;

	// Allocate Unified Memory -- accessible from CPU or GPU.
	// Short-circuit evaluation stops at the first failed allocation.
	ok = checkCudaErr(cudaMallocManaged(&a, N*sizeof(float)), "cudaMallocManaged1") == cudaSuccess
	  && checkCudaErr(cudaMallocManaged(&b, N*sizeof(float)), "cudaMallocManaged2") == cudaSuccess
	  && checkCudaErr(cudaMallocManaged(&partial_c, blocksPerGrid*sizeof(float)), "cudaMallocManaged3") == cudaSuccess;

	if (ok) {
		// fill in the memory with data: a[i] = i, b[i] = 2i,
		// so the dot product equals 2 * sum(i^2) for i in [0, N)
		for (int i=0; i<N; i++) {
			a[i] = (float)i;
			b[i] = (float)(i*2);
		}

		dot<<<blocksPerGrid,threadsPerBlock>>>( a, b,  partial_c );

		// A launch returns no status itself: configuration errors are
		// reported by cudaGetLastError, and faults raised while the kernel
		// runs are reported by the synchronizing call. The synchronization
		// is also required before the host reads partial_c.
		ok = checkCudaErr(cudaGetLastError(), "dot kernel launch") == cudaSuccess
		  && checkCudaErr(cudaDeviceSynchronize(), "cudaDeviceSynchronize") == cudaSuccess;
	}

	if (ok) {
		// finish up on the CPU side
		c = 0;
		for (int i=0; i<blocksPerGrid; i++) {
			c += partial_c[i];
		}

		// Closed form for 0^2 + 1^2 + ... + x^2; the argument is
		// parenthesized so compound expressions expand correctly.
		#define sum_squares(x)  ((x)*((x)+1)*(2*(x)+1)/6)
		printf( "Does GPU value %.6g = %.6g?\n", c, 2 * sum_squares( (float)(N - 1) ) );
	}

	// free memory on the gpu side; each call runs regardless of earlier failures
	if (checkCudaErr( cudaFree( a ) , "cudaFree1") != cudaSuccess) ok = false;
	if (checkCudaErr( cudaFree( b ) , "cudaFree2") != cudaSuccess) ok = false;
	if (checkCudaErr( cudaFree( partial_c ) , "cudaFree3") != cudaSuccess) ok = false;
	if (checkCudaErr( cudaDeviceReset(), "cudaDeviceReset") != cudaSuccess) ok = false;

	return ok ? 0 : 1;
}

Does GPU value 2.57236e+13 = 2.57236e+13?

