<a href="https://colab.research.google.com/github/mmmovania/CUDA_Spring2023/blob/main/Week3/CalculateSumOnGPU_BlockIdx.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git
%load_ext nvcc_plugin

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/andreinechaev/nvcc4jupyter.git
  Cloning https://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-0u8oa1kx
  Running command git clone --filter=blob:none --quiet https://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-0u8oa1kx
  Resolved https://github.com/andreinechaev/nvcc4jupyter.git to commit aac710a35f52bb78ab34d2e52517237941399eff
  Preparing metadata (setup.py) ... done
Building wheels for collected packages: NVCCPlugin
  Building wheel for NVCCPlugin (setup.py) ... done
  Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-py3-none-any.whl size=4304 sha256=2320f482760107f2cf6c1363a21243ca47b8677dc9a40bdd3faf4ae89980ce59
  Stored in directory: /tmp/pip-ephem-wheel-cache-pcpiot08/wheels/f3/08/cc/e2b5b0e1c92df07dbb50a6f024a68ce090f5e7b2316b41756d
Successfully built NVCCPlugin
Installing collecte

In [2]:
%%cu 
#include <stdio.h>
#include <stdlib.h>

/*
 * Device kernel: element-wise addition, c[k] = a[k] + b[k].
 *
 * Launch layout: the element index is taken from blockIdx.x only, so each
 * block handles exactly one element and threadIdx is never consulted.
 * There is no bounds check: the grid's x-dimension must not exceed the
 * number of elements in a, b and c.
 *
 * a, b - input arrays (pointers must be dereferenceable from device code)
 * c    - output array, same length as the inputs
 */
__global__ void sum(int* a, int* b, int* c) {
	const int block = blockIdx.x;
	const int lhs = a[block];
	const int rhs = b[block];
	c[block] = lhs + rhs;
}

/*
 * CPU reference implementation of the element-wise addition.
 *
 * Writes c[k] = a[k] + b[k] for every k in [0, N). When N <= 0 nothing is
 * read or written.
 *
 * a, b - input arrays with at least N elements each
 * c    - output array with at least N elements
 * N    - number of elements to process
 */
void sum_host(int* a, int* b, int* c, const int N) {
	int idx = 0;
	while (idx < N) {
		c[idx] = a[idx] + b[idx];
		++idx;
	}
}


/*
 * Terminates the program with a diagnostic on stderr when a CUDA runtime
 * call did not return cudaSuccess.
 *
 * status - value returned by the CUDA runtime call
 * what   - short description of the operation, used in the error message
 */
static void check_cuda(cudaError_t status, const char* what) {
	if (status != cudaSuccess) {
		fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
		exit(EXIT_FAILURE);
	}
}

/*
 * Demo driver: adds two N-element integer vectors on the host and then on
 * the device, printing both result tables so they can be compared.
 *
 * Returns 0 on success. Exits with a non-zero status if a host allocation
 * or any CUDA runtime call fails.
 */
int main() {
	int* h_a = 0;
	int* h_b = 0;
	int* h_c = 0;

	int* d_a = 0;
	int* d_b = 0;
	int* d_c = 0;

	const int N = 128;
	size_t size = N * sizeof(int);

	//allocate host memory
	h_a = (int*)malloc(size);
	h_b = (int*)malloc(size);
	h_c = (int*)malloc(size);
	if (h_a == NULL || h_b == NULL || h_c == NULL) {
		fprintf(stderr, "Host allocation of %zu bytes failed\n", size);
		//free(NULL) is a no-op, so this is safe whichever allocation failed
		free(h_a);
		free(h_b);
		free(h_c);
		return EXIT_FAILURE;
	}

	//initialize a, b and c
	for(int i=0;i<N;++i) {
		h_a[i] = i+1;
		h_b[i] = h_a[i]*2;
		h_c[i] = 0;
	}

	//allocate device memory
	check_cuda(cudaMalloc((void**)&d_a, size), "cudaMalloc(d_a)");
	check_cuda(cudaMalloc((void**)&d_b, size), "cudaMalloc(d_b)");
	check_cuda(cudaMalloc((void**)&d_c, size), "cudaMalloc(d_c)");

	//copy host data to device memory
	check_cuda(cudaMemcpy(d_a, h_a, size, cudaMemcpyHostToDevice), "cudaMemcpy(h_a -> d_a)");
	check_cuda(cudaMemcpy(d_b, h_b, size, cudaMemcpyHostToDevice), "cudaMemcpy(h_b -> d_b)");

	//calculate on host
	sum_host(h_a, h_b, h_c, N);

	//output result
	printf("Host calculation result: \n");
	for(int i=0;i<N;++i) {
		printf("%3d + %3d = %3d\n", h_a[i], h_b[i], h_c[i]);
		//clear host result to ensure that the result of device is actually from the kernel
		h_c[i] = 0;
	}

	//calculate on device: N blocks of one thread each, because the kernel
	//indexes by blockIdx.x only
	sum<<<N, 1>>>(d_a, d_b, d_c);
	//a kernel launch returns nothing; launch-configuration errors are only
	//visible through cudaGetLastError()
	check_cuda(cudaGetLastError(), "sum kernel launch");
	//errors raised while the kernel runs are reported by the next synchronizing call
	check_cuda(cudaDeviceSynchronize(), "sum kernel execution");

	//copy result from device to host
	check_cuda(cudaMemcpy(h_c, d_c, size, cudaMemcpyDeviceToHost), "cudaMemcpy(d_c -> h_c)");

	//output result
	printf("--------------------------------------\n");
	printf("Device calculation result: \n");
	for(int i=0;i<N;++i) {
		printf("%3d + %3d = %3d\n", h_a[i], h_b[i], h_c[i]);
	}
	printf("--------------------------------------\n");

	//delete data allocated on device
	check_cuda(cudaFree(d_a), "cudaFree(d_a)");
	check_cuda(cudaFree(d_b), "cudaFree(d_b)");
	check_cuda(cudaFree(d_c), "cudaFree(d_c)");

	//delete host memory
	free(h_a);
	free(h_b);
	free(h_c);

	check_cuda(cudaDeviceReset(), "cudaDeviceReset");
	return 0;
}

Host calculation result: 
  1 +   2 =   3
  2 +   4 =   6
  3 +   6 =   9
  4 +   8 =  12
  5 +  10 =  15
  6 +  12 =  18
  7 +  14 =  21
  8 +  16 =  24
  9 +  18 =  27
 10 +  20 =  30
 11 +  22 =  33
 12 +  24 =  36
 13 +  26 =  39
 14 +  28 =  42
 15 +  30 =  45
 16 +  32 =  48
 17 +  34 =  51
 18 +  36 =  54
 19 +  38 =  57
 20 +  40 =  60
 21 +  42 =  63
 22 +  44 =  66
 23 +  46 =  69
 24 +  48 =  72
 25 +  50 =  75
 26 +  52 =  78
 27 +  54 =  81
 28 +  56 =  84
 29 +  58 =  87
 30 +  60 =  90
 31 +  62 =  93
 32 +  64 =  96
 33 +  66 =  99
 34 +  68 = 102
 35 +  70 = 105
 36 +  72 = 108
 37 +  74 = 111
 38 +  76 = 114
 39 +  78 = 117
 40 +  80 = 120
 41 +  82 = 123
 42 +  84 = 126
 43 +  86 = 129
 44 +  88 = 132
 45 +  90 = 135
 46 +  92 = 138
 47 +  94 = 141
 48 +  96 = 144
 49 +  98 = 147
 50 + 100 = 150
 51 + 102 = 153
 52 + 104 = 156
 53 + 106 = 159
 54 + 108 = 162
 55 + 110 = 165
 56 + 112 = 168
 57 + 114 = 171
 58 + 116 = 174
 59 + 118 = 177
 60 + 120 = 180
 61 + 122 = 18