In [None]:
!pip install git+git://github.com/andreinechaev/nvcc4jupyter.git

Collecting git+git://github.com/andreinechaev/nvcc4jupyter.git
  Cloning git://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-mkdp53gr
  Running command git clone -q git://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-mkdp53gr
Building wheels for collected packages: NVCCPlugin
  Building wheel for NVCCPlugin (setup.py) ... [?25l[?25hdone
  Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-cp36-none-any.whl size=4307 sha256=cba8dade6485a604a1aa811705387517b923e758050ae315366ecfe03bf1acca
  Stored in directory: /tmp/pip-ephem-wheel-cache-229moo91/wheels/10/c2/05/ca241da37bff77d60d31a9174f988109c61ba989e4d4650516
Successfully built NVCCPlugin
Installing collected packages: NVCCPlugin
Successfully installed NVCCPlugin-0.0.2


In [None]:
# Load the nvcc_plugin IPython extension installed by the previous cell.
# Presumably this is what registers the %%cu cell magic used below — confirm against the plugin docs.
%load_ext nvcc_plugin

created output directory at /content/src
Out bin /content/result.out


In [None]:
%%cu
#include<bits/stdc++.h>

using namespace std;

/**
 * Host reference: adds the first n elements of `a` to `cpu_res`.
 *
 * `cpu_res` is an accumulator and is NOT reset here; the caller must zero it
 * beforehand to obtain a plain sum.
 *
 * @param a        input values (taken by const reference to avoid copying the vector)
 * @param cpu_res  in/out accumulator that receives the added elements
 * @param n        number of leading elements to add; values larger than
 *                 a.size() are clamped, values <= 0 add nothing
 */
void vector_sum(const vector<float>& a,float &cpu_res,int n) {
    // Never read past the end of the vector, even if the caller over-reports n.
    const int count = std::min(n, static_cast<int>(a.size()));
    for (int i = 0; i < count; i++) {
        cpu_res += a[i];
    }
}

/**
 * Host reference: writes the smallest of the first n elements of `a` to `cpu_res`.
 *
 * @param a        input values (taken by const reference to avoid copying the vector)
 * @param cpu_res  output; overwritten with the minimum. When no element is
 *                 examined (n <= 0 or empty vector) it is left at the largest
 *                 finite float.
 * @param n        number of leading elements to examine; clamped to a.size()
 */
void vector_min(const vector<float>& a,float &cpu_res,int n){
    // Seed with the largest finite float. INT_MAX (~2.1e9) is not an upper
    // bound for float data, so larger inputs would yield a wrong minimum.
    cpu_res = numeric_limits<float>::max();
    const int count = std::min(n, static_cast<int>(a.size()));
    for (int i = 0; i < count; i++) {
        cpu_res = std::min(cpu_res,a[i]);
    }
}

/**
 * Host reference: writes the largest of the first n elements of `a` to `cpu_res`.
 *
 * @param a        input values (taken by const reference to avoid copying the vector)
 * @param cpu_res  output; overwritten with the maximum. When no element is
 *                 examined (n <= 0 or empty vector) it is left at the most
 *                 negative finite float.
 * @param n        number of leading elements to examine; clamped to a.size()
 */
void vector_max(const vector<float>& a,float &cpu_res,int n){
    // Seed with the most negative finite float. INT_MIN (~-2.1e9) is not a
    // lower bound for float data, so smaller inputs would yield a wrong maximum.
    cpu_res = numeric_limits<float>::lowest();
    const int count = std::min(n, static_cast<int>(a.size()));
    for (int i = 0; i < count; i++) {
        cpu_res = std::max(cpu_res,a[i]);
    }
}

/**
 * Host reference: computes the population variance of the first n elements of `a`.
 *
 * Despite the name, the value written is the mean squared deviation
 * (variance); the caller takes the square root to obtain the standard
 * deviation.
 *
 * @param a           input values (taken by const reference to avoid copying the vector)
 * @param sum         sum of the first n elements, supplied by the caller
 * @param cpu_res_sd  output; overwritten with sum((a[i]-mean)^2) / n
 * @param n           number of elements; must satisfy 0 < n <= a.size()
 *                    (n == 0 divides by zero and yields a non-finite result)
 */
void vector_sd(const vector<float>& a,float sum,double &cpu_res_sd,int n){
    const double mean = (double)sum/(double)n;

    // Accumulate squared deviations in double precision.
    double s = 0;
    for(int i=0;i<n;i++){
        const double deviation = a[i]-mean;
        s += deviation*deviation;
    }

    cpu_res_sd = s/(double)n;
}

/**
 * In-place pairwise (tree) sum reduction; the total ends up in a[0].
 *
 * Launch layout: a 1-D block of any size >= 1. Only block 0 does work,
 * because the barrier used between reduction levels only synchronizes threads
 * of one block; extra blocks exit immediately. No shared memory is used.
 * The contents of a[1..n-1] are clobbered with partial sums.
 *
 * @param a  device array of at least n floats (modified in place)
 * @param n  number of elements to reduce; n <= 1 leaves the array untouched
 */
__global__ void cuda_vector_sum(float* a,int n) {
    // Uniform per-block exit: safe with respect to the barrier below.
    if (blockIdx.x != 0) return;

    const int tid=threadIdx.x;
    const int no_of_threads=blockDim.x;

    // 64-bit arithmetic keeps 2*step*tid from overflowing for large n.
    for(long long step=1;step < n; step *= 2){
      const long long span=2*step;

      // Each thread strides over the pairs of this level, so the result is
      // correct for any block size. Elements without a partner are left as is.
      for(long long ind=span*tid; (ind+step) < n; ind += span*no_of_threads){
        a[ind] = a[ind] + a[ind+step];
      }

      // All writes of this level must be visible before the next level reads
      // them. The outer loop bound depends only on n, so every thread of the
      // block reaches this barrier the same number of times.
      __syncthreads();
    }
}

/**
 * In-place pairwise (tree) minimum reduction; the minimum ends up in a[0].
 *
 * Launch layout: a 1-D block of any size >= 1. Only block 0 does work,
 * because the barrier used between reduction levels only synchronizes threads
 * of one block; extra blocks exit immediately. No shared memory is used.
 * The contents of a[1..n-1] are clobbered with partial minima.
 *
 * @param a  device array of at least n floats (modified in place)
 * @param n  number of elements to reduce; n <= 1 leaves the array untouched
 */
__global__ void cuda_vector_min(float* a,int n) {
    // Uniform per-block exit: safe with respect to the barrier below.
    if (blockIdx.x != 0) return;

    const int tid=threadIdx.x;
    const int no_of_threads=blockDim.x;

    // 64-bit arithmetic keeps 2*step*tid from overflowing for large n.
    for(long long step=1;step < n; step *= 2){
      const long long span=2*step;

      // Each thread strides over the pairs of this level, so the result is
      // correct for any block size. Elements without a partner are left as is.
      for(long long ind=span*tid; (ind+step) < n; ind += span*no_of_threads){
        a[ind] = fminf(a[ind],a[ind+step]);
      }

      // All writes of this level must be visible before the next level reads
      // them. The outer loop bound depends only on n, so every thread of the
      // block reaches this barrier the same number of times.
      __syncthreads();
    }
}

/**
 * In-place pairwise (tree) maximum reduction; the maximum ends up in a[0].
 *
 * Launch layout: a 1-D block of any size >= 1. Only block 0 does work,
 * because the barrier used between reduction levels only synchronizes threads
 * of one block; extra blocks exit immediately. No shared memory is used.
 * The contents of a[1..n-1] are clobbered with partial maxima.
 *
 * @param a  device array of at least n floats (modified in place)
 * @param n  number of elements to reduce; n <= 1 leaves the array untouched
 */
__global__ void cuda_vector_max(float* a,int n) {
    // Uniform per-block exit: safe with respect to the barrier below.
    if (blockIdx.x != 0) return;

    const int tid=threadIdx.x;
    const int no_of_threads=blockDim.x;

    // 64-bit arithmetic keeps 2*step*tid from overflowing for large n.
    for(long long step=1;step < n; step *= 2){
      const long long span=2*step;

      // Each thread strides over the pairs of this level, so the result is
      // correct for any block size. Elements without a partner are left as is;
      // in particular they are never compared against FLT_MIN, which is the
      // smallest POSITIVE float and would replace negative values.
      for(long long ind=span*tid; (ind+step) < n; ind += span*no_of_threads){
        a[ind] = fmaxf(a[ind],a[ind+step]);
      }

      // All writes of this level must be visible before the next level reads
      // them. The outer loop bound depends only on n, so every thread of the
      // block reaches this barrier the same number of times.
      __syncthreads();
    }
}

/**
 * Replaces each element with its squared deviation from `mean`, in place.
 *
 * The element index is threadIdx.x alone and there is no bounds check, so the
 * caller must launch a single block with exactly one thread per array element.
 *
 * @param a     device array, modified in place
 * @param mean  value subtracted from every element before squaring
 */
__global__ void cuda_update_arr(float *a,double mean){
    float* const slot = a + threadIdx.x;
    // The subtraction and the product are evaluated in double precision and
    // narrowed to float only on the store.
    const double deviation = *slot - mean;
    *slot = deviation * deviation;
}

/**
 * Demo driver: fills a vector with random values in [0, 100], prints it, then
 * computes sum, average, standard deviation, minimum and maximum first on the
 * CPU and then on the GPU, printing each result.
 *
 * Every CUDA runtime call and kernel launch is checked; on failure a message
 * is written to stderr and the process exits with a non-zero status.
 */
int main() {
    const int N = 100;
    // cuda_update_arr is launched with one thread per element in one block.
    static_assert(N >= 1 && N <= 1024, "N must fit in a single thread block");

    vector<float> a(N);
    srand(time(0));
    generate(begin(a), end(a), []() { return (float(rand())/float((RAND_MAX)) * 100.0f); });

    for(auto item:a)
      cout<<item<<" ";
    cout<<'\n';

    float cpu_res=0,gpu_res=0;
    double cpu_res_sd = 0,gpu_res_sd = 0;

    cout<<"CPU: "<<'\n';
  //-------------------------------------------------------------------

    // Sum calculation
    vector_sum(a,cpu_res,N);
    cout << "Vector Sum using CPU :"<<cpu_res<<" \n";

    // Average calculation
    cout << "Vector Average using CPU :"<<(double)cpu_res/(double)N<<" \n";

    // vector_sd yields the variance; the square root is taken when printing.
    // Note: fixed/setprecision(2) stay in effect for all later output.
    vector_sd(a,cpu_res,cpu_res_sd,N);
    cout << "Vector Standard Deviation using CPU :"<<fixed<<setprecision(2)<<sqrt(cpu_res_sd)<<" \n";

    vector_min(a,cpu_res,N);
    cout << "Vector Min using CPU :"<<cpu_res<<" \n";

    vector_max(a,cpu_res,N);
    cout << "Vector Max using CPU :"<<cpu_res<<" \n";


    cout<<"GPU: "<<'\n';
  //-------------------------------------------------------------------

    // Abort with a readable message as soon as any CUDA call fails.
    auto check = [](cudaError_t status, const char* what) {
        if (status != cudaSuccess) {
            cerr << "CUDA error during " << what << ": "
                 << cudaGetErrorString(status) << '\n';
            exit(EXIT_FAILURE);
        }
    };

    // Allocate memory on the device
    const size_t bytes = sizeof(float) * N;
    float* d_a = nullptr;
    check(cudaMalloc(&d_a, bytes), "cudaMalloc");

    // One thread per element pair, rounded up so odd N is covered and the
    // launch never requests zero threads.
    const int reduce_threads = max(1, (N + 1) / 2);

    // Copy the original data from the host to the device (CPU to GPU).
    // The reduction kernels work in place, so this is redone before each one.
    auto upload = [&]() {
        check(cudaMemcpy(d_a, a.data(), bytes, cudaMemcpyHostToDevice),
              "host-to-device copy");
    };

    // Launch an in-place reduction kernel and fetch its result from d_a[0].
    // The blocking cudaMemcpy also surfaces errors raised while the kernel ran.
    auto reduce_on_gpu = [&](void (*kernel)(float*, int), const char* what) -> float {
        float result = 0.0f;
        kernel<<<1, reduce_threads>>>(d_a, N);
        check(cudaGetLastError(), what);
        check(cudaMemcpy(&result, d_a, sizeof(float), cudaMemcpyDeviceToHost), what);
        return result;
    };

  //-------------------------------------------------------------------

    upload();
    gpu_res = reduce_on_gpu(cuda_vector_sum, "cuda_vector_sum");
    cout << "Vector Sum using GPU :"<<gpu_res<<" \n";

  //-------------------------------------------------------------------

    cout << "Vector Average using GPU :"<<(double)gpu_res/(double)N<<" \n";

  //-------------------------------------------------------------------

    // Variance = mean of squared deviations: square on the device, then sum.
    const double mean = (double)gpu_res/(double)N;
    upload();
    cuda_update_arr<<<1,N>>>(d_a,mean);
    check(cudaGetLastError(), "cuda_update_arr");
    gpu_res = reduce_on_gpu(cuda_vector_sum, "cuda_vector_sum (squared deviations)");
    gpu_res_sd = (double)gpu_res/(double)N;
    cout << "Vector Standard Deviation using GPU :"<<fixed<<setprecision(2)<<sqrt(gpu_res_sd)<<" \n";

  //-------------------------------------------------------------------

    upload();
    gpu_res = reduce_on_gpu(cuda_vector_min, "cuda_vector_min");
    cout << "Vector Min using GPU :"<<gpu_res<<" \n";

  //-------------------------------------------------------------------

    upload();
    gpu_res = reduce_on_gpu(cuda_vector_max, "cuda_vector_max");
    cout << "Vector Max using GPU :"<<gpu_res<<" \n";

  //-------------------------------------------------------------------

    // Free memory on device
    check(cudaFree(d_a), "cudaFree");
    return 0;
}


21.4509 51.1716 2.28868 79.6607 67.2713 20.6124 16.362 19.0212 27.3255 97.9246 16.9107 97.4966 77.2343 57.6389 72.4909 71.9441 68.4711 63.3909 93.328 14.5983 13.4629 5.33844 24.0436 0.100423 77.6323 98.1466 76.2849 74.0932 91.4415 76.3344 61.0801 12.8924 27.506 63.3688 92.5531 94.7773 83.9813 8.91513 13.7986 11.3068 6.83973 30.7093 8.80338 84.0741 88.3481 81.2943 56.0182 56.8192 44.6852 49.3462 71.4175 58.1481 54.6846 95.4611 58.2485 32.3169 93.6077 34.5334 6.41015 85.0492 10.8678 67.4903 97.9416 38.3738 30.8591 90.4947 33.1511 14.8404 99.4099 46.9497 26.1471 6.2496 77.659 34.9505 90.3237 66.0071 16.2448 46.3418 22.8264 60.93 95.688 94.2438 19.0781 50.3726 89.705 77.3266 82.6895 83.3126 11.8599 89.0997 68.3618 22.7277 56.59 66.3035 61.1015 87.4491 56.7982 94.2527 2.28944 56.208 
CPU: 
Vector Sum using CPU :5383.98 
Vector Average using CPU :53.8398 
Vector Standard Deviation using CPU :30.91 
Vector Min using CPU :0.10 
Vector Max using CPU :99.41 
GPU: 
Vector Sum using GPU :5383.98 
