This notebook contains my notes and implementation of some coding examples in [Parallel computing with CUDA](https://app.pluralsight.com/library/courses/parallel-computing-cuda) by Dmitri Nesteruk on Pluralsight

# CUDA set-up

In [2]:
# checking installed CUDA/nvcc 
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2021 NVIDIA Corporation
Built on Sun_Feb_14_21:12:58_PST_2021
Cuda compilation tools, release 11.2, V11.2.152
Build cuda_11.2.r11.2/compiler.29618528_0


In [3]:
# Run the given command to install an extension to run nvcc from the Notebook cells
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/andreinechaev/nvcc4jupyter.git
  Cloning https://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-iqhnb3g3
  Running command git clone -q https://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-iqhnb3g3
Building wheels for collected packages: NVCCPlugin
  Building wheel for NVCCPlugin (setup.py) ... [?25l[?25hdone
  Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-py3-none-any.whl size=4306 sha256=f3fa4069d2c03195ef87cf157858b1913485c96e76c36564ce881a9d50fb1b6f
  Stored in directory: /tmp/pip-ephem-wheel-cache-gs96kw08/wheels/ca/33/8d/3c86eb85e97d2b6169d95c6e8f2c297fdec60db6e84cb56f5e
Successfully built NVCCPlugin
Installing collected packages: NVCCPlugin
Successfully installed NVCCPlugin-0.0.2


In [4]:
# Load the extension
%load_ext nvcc_plugin


created output directory at /content/src
Out bin /content/result.out


## Introduction to CUDA C/C++

### Hello , CUDA

Adding array elements without cuda

In [5]:
%%cu
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>

void addArrays(int* a , int* b ,int* c ,int count){
    for (int i = 0; i< count ; ++i){
        c[i] = a[i]+b[i];
    }
}

int main (){
    const int count = 5;
    int a[] = {1, 2, 3, 4, 5};
    int b[] = {10, 20, 30, 40, 50};
    int c[count];

    addArrays(a,b,c,count);

    for (int i = 0; i< count ; ++i){
      printf("%d ",c[i]);
    }
    
    return 0;
}





11 22 33 44 55 


Simulating multible threads

In [6]:
%%cu
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>

void addArrays(int* a , int* b ,int* c ,int i){
    c[i] = a[i]+b[i];
}

int main (){
    const int count = 5;
    int a[] = {1, 2, 3, 4, 5};
    int b[] = {10, 20, 30, 40, 50};
    int c[count];

  for (int i = 0; i< count ; ++i){
      addArrays(a,b,c,i);
  }

    for (int i = 0; i< count ; ++i){
      printf("%d ",c[i]);
    }
    
    return 0;
}

11 22 33 44 55 


### Transioning from CPU To GPU :
- ##### A function to run on GPU (a kernel) is defined using "__ global __" declaration specifier indicating that the corresponding function is to be called from the host and executed on a device.

- ##### CUDA operates on device memory , cannot operate on host memory: 
 - cudaMalloc -> for allocating memory on the device  
 - cudaMemcpy -> for copying data between host and device 

- ##### Dimensions of Grid and block is specified on kernel invocation in dim3 structure
- ##### Thread index i is an implicit variable ; A runing thread has all the information about the execution parameters as well as its own position on the grid and thread block

In [8]:
%%cu
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>

__global__ void addArrays(int* a , int* b ,int* c ){
    
    int i = threadIdx.x ; // threadid is var that contains the pos of this kernel as it's executing in a thread block
    c[i] = a[i]+b[i] ;
}

int main (){
    const int count = 5;
    const int size = count * sizeof(int);
    int ha[] = {1, 2, 3, 4, 5};  // h stands for host (smth runs on the cpu)
    int hb[] = {10, 20, 30, 40, 50};
    int hc[count];

    // Allocate some mem for these data to be copied from the host to the device
    int *da, *db, *dc;
    cudaMalloc (&da, size);
    cudaMalloc (&db, size);
    cudaMalloc (&dc, size);

    // copy the harrays into
    cudaMemcpy(da,ha,size,cudaMemcpyHostToDevice);
    cudaMemcpy(db,hb,size,cudaMemcpyHostToDevice);

    // 1 block , count threads in it
    addArrays<<<1,count>>>(da,db,dc);

    // back to host 
    cudaMemcpy(hc,dc,size,cudaMemcpyDeviceToHost);

    for (int i = 0; i< count ; ++i){
      printf("%d ",hc[i]);
    }

    //cleaning up
    cudaFree(da);
    cudaFree(db);
    cudaFree(dc);
    
    return 0;
}

11 22 33 44 55 


###Device Query

Quering device parameters

In [9]:
%%cu
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
using namespace std;

int main(){
    int count;
    cudaGetDeviceCount(&count);
    

    // To get information about a particular device 

    cudaDeviceProp prop;
    for (int i =0 ; i<count ; ++i){
        cudaGetDeviceProperties(&prop , i);
        cout<<"Device "<<i <<": "<< prop.name << endl ;
        cout<<"Compute capapility "<<": "<< prop.major <<"."<< prop.minor << endl ;
        cout<<"Max Grid Dim :( "<< prop.maxGridSize[0] << " x " << prop.maxGridSize[1] << " x " << prop.maxGridSize[2] << ") "<< endl ;
        cout<<"Max BLock Dim :( "<< prop.maxThreadsDim[0] << " x " << prop.maxThreadsDim[1] << " x " << prop.maxThreadsDim[2] << ") "<< endl ;


    }
    return(0);
}

Device 0: Tesla T4
Compute capapility : 7.5
Max Grid Dim :( 2147483647 x 65535 x 65535) 
Max BLock Dim :( 1024 x 1024 x 64) 



## Parallel programming patterns

### Map

In [10]:
%%cuda --name my_curand.cu 

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "cuda.h"
#include "curand.h"

#include <ctime>
#include <cstdio>
#include <iostream>
using namespace std;


__global__ void addTen(float *d,int count)
{
    int threadsPerBlock = blockDim.x * blockDim.y * blockDim.z;
    int threadPosInBlock = threadIdx.x + blockDim.x * threadIdx.y + blockDim.x * blockDim.y * threadIdx.z;     
    int blockPosInGrid = blockIdx.x + gridDim.x * blockIdx.y + gridDim.x * gridDim.y * blockIdx.z;  
    int tid = blockPosInGrid * threadsPerBlock + threadPosInBlock ;

    if (tid<count){
        d[tid] = d[tid]+10;
    }
}

int main() {
    curandGenerator_t gen;
    curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_MTGP32);
    curandSetPseudoRandomGeneratorSeed(gen, time(0));

    const int count = 123456;
    const int size = count * sizeof(float);
    float *d;
    float h[count];
    cudaMalloc(&d,size); 

    //initializing the array 
    curandGenerateUniform(gen, d , count); 
    
    //kernel dimension
    dim3 block(8,8,8);
    dim3 grid(16,16);

    addTen<<<grid,block>>>(d,count);

    cudaMemcpy(h,d,size,cudaMemcpyDeviceToHost);
    cudaFree(d);

    for(int i =0; i< 100;++i){
        cout << h[i] <<" " ;
    }

    return 0;
}



'File written in /content/src/my_curand.cu'

In [11]:
!nvcc -o /content/src/my_curand /content/src/my_curand.cu -lcurand
!/content/src/my_curand

10.0852 10.3286 10.5196 10.3174 10.9631 10.444 10.9331 10.3817 10.6773 10.2615 10.6371 10.7809 10.0257 10.9503 10.4384 10.2716 10.3638 10.5453 10.5524 10.2075 10.3764 10.5308 10.3242 10.5421 10.0548 10.7493 10.1657 10.5587 10.519 10.2286 10.3594 10.4039 10.6184 10.3901 10.5421 10.4287 10.203 10.938 10.1874 10.0178 10.2055 10.3246 10.4924 10.7729 10.9913 10.3417 10.2838 10.2943 10.5655 10.1501 10.8893 10.8567 10.8777 10.902 10.4013 10.4166 10.6831 10.7198 10.3484 10.3256 10.4042 10.497 10.2061 10.1565 10.9536 10.7907 10.4115 10.0889 10.0524 10.7304 10.2368 10.6228 10.3419 10.5986 10.5086 10.2193 10.1952 10.0249 10.0181 10.4106 10.167 10.6811 10.8587 10.4183 10.218 10.9605 10.6729 10.3096 10.2346 10.6893 10.1103 10.8155 10.9644 10.7093 10.2212 10.0123 10.3107 10.646 10.2648 10.3904 

### Black Scholes

In [18]:
%%cuda --name black_scholes.cu 

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "curand.h"
#define _USE_MATH_DEFINES
#include <ctime>
#include <cstdio>
#include <iostream>
#include <math.h>
using namespace std;

__device__ __host__ __inline__ float N(float x)
{
	return 0.5 + 0.5*erf(x*M_SQRT1_2);
}

__device__ __host__ void price(float k, float s, float t, float r, float v, float* c, float* p)
{
	float srt = v * sqrtf(t);
	float d1 = (logf(s/k)+(r+0.5*v*v)*t) / srt;
	float d2 = d1 - srt;
	float kert = k * expf(-r*t);
	*c = N(d1)*s - N(d2)*kert;
	*p = kert - s + *c;
}

__global__ void price(float* k,float* s,float* t,float* r,float* v,float* c,float* p){
    int idx = threadIdx.x;
    price(k[idx] , s[idx] , t[idx] , r[idx] , v[idx] , &c[idx] , &p[idx] );
}


int main()
{
	const int count = 512;
	curandStatus_t curandStatus;
	cudaError_t cudaError;
	float* args[5];
	curandGenerator_t gen;
	curandStatus = curandCreateGenerator(&gen, curandRngType_t::CURAND_RNG_PSEUDO_MTGP32);
	for (int i = 0; i < 5; ++i) 
	{
		cudaMalloc(&args[i], sizeof(float)*1024);
		curandStatus = curandGenerateUniform(gen, args[i], count);
	}

	float *dc, *dp;
	cudaError = cudaMalloc(&dc, count*sizeof(float));
	cudaError = cudaMalloc(&dp, count*sizeof(float));

	price<<<1,count>>>(args[0], args[1], args[2], args[3], args[4], dc, dp);

	float hc[count] = { 0 };
	float hp[count] = { 0 };
	cudaMemcpy(hc, dc, sizeof(float)*count, cudaMemcpyKind::cudaMemcpyDeviceToHost);
	cudaMemcpy(hp, dp, sizeof(float)*count, cudaMemcpyKind::cudaMemcpyDeviceToHost);

	cudaFree(dc);
	cudaFree(dp);
	for (int i = 0; i < 5; ++i)
		cudaFree(&args[i]);
	cudaDeviceReset();
  return 0;
}

'File written in /content/src/black_scholes.cu'

In [None]:
!nvcc -o /content/src/black_scholes /content/src/black_scholes.cu -lcurand
!/content/src/black_scholes

## Atomic operations

### Atomic sum

In [21]:
%%cu
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "sm_20_atomic_functions.h"

#include <iostream>
using namespace std;

__device__ int dSum = 0;

__global__ void sum(int* d)
{
    int tid = threadIdx.x;
    //dSum += d[tid];   //this will go wrong as when in several threads in parallel
    atomicAdd(&dSum,d[tid]);
}

int main ()
{
    const int count = 128;
    const int size = count * sizeof(int);
  
    int h[count];
    for (int i =0 ; i<count;i++)
    {
        h[i] = i+1;
    }
 
    int* d;
    cudaMalloc(&d ,size );
    cudaMemcpy(d,h,size,cudaMemcpyHostToDevice);
    sum<<<1,count>>>(d);
 
    int hSum;

    cudaMemcpyFromSymbol(&hSum , dSum, sizeof(int));
    cout<< "the sum of nums from 1 to "<<count << " is: " << hSum <<endl;
 
    cudaFree(d);
    return 0;
 
}

the sum of nums from 1 to 128 is: 8256



### Monte Carlo Pi

In [22]:
%%cuda --name MonteCarloPi.cu 
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "sm_20_atomic_functions.h"
#include "curand.h"

#include <ctime>
#include <iostream>
#include <iomanip>

using namespace std;

__device__ int dCount = 0;

__global__ void countPoints(const float* xs, const float* ys )
{
    int idx = blockIdx.x + blockDim.x + threadIdx.x;
    float x = xs[idx] - 0.5f;
    float y = ys[idx] - 0.5f;
    int n = sqrt(x*x+y*y) > 0.5f ? 0 : 1;
    atomicAdd(&dCount,n);
}

int main ()
{
    const int count = 512*512;
    const int size = count * sizeof(float);

    cudaError_t cudaStatus;
    curandStatus_t curandStatus;
    curandGenerator_t gen;
 
    //initialize the 512*512 array of rand points
    curandStatus = curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_MTGP32);
    curandSetPseudoRandomGeneratorSeed(gen, time(0));
 
    //initializaing x, y coordinate arrays
    float* x;
    float* y;
    cudaStatus = cudaMalloc(&x ,size );
    cudaStatus = cudaMalloc(&y ,size );
    curandStatus = curandGenerateUniform(gen, x, count);
    curandStatus = curandGenerateUniform(gen, y, count);
 
    //count points
    countPoints<<<512,512>>>(x,y);

    int hCount;

    cudaMemcpyFromSymbol(&hCount, dCount, sizeof(int));
 
    cudaFree(x);
    cudaFree(y);
 
    cout << setprecision(12) 
         << "Pi is approximately " 
         << (4.0f * (float)hCount)/ (float)count << endl;
    
    return 0;
 
}

'File written in /content/src/MonteCarloPi.cu'

In [23]:
# Compile the code
!nvcc -o /content/src/MonteCarloPi /content/src/MonteCarloPi.cu -lcurand





In [24]:
# Run the code
!/content/src/MonteCarloPi

Pi is approximately 3.17822265625


## CUDA Events

In [25]:
%%cu
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "sm_20_atomic_functions.h"

#include <iostream>
using namespace std;

__device__ int dSum = 0;

__global__ void sum(int* d)
{
    int tid = threadIdx.x;
    //dSum += d[tid]; //this will go wrong as when in several threads in parallel
    atomicAdd(&dSum,d[tid]);
}

int main ()
{
    const int count = 128;
    const int size = count * sizeof(int);
  
    int h[count];
    for (int i =0 ; i<count;i++)
    {
        h[i] = i+1;
    }
 
    int* d;
    cudaMalloc(&d ,size );
    cudaMemcpy(d,h,size,cudaMemcpyHostToDevice);
    
    // create events
    cudaEvent_t start, end ;
    cudaEventCreate(&start);
    cudaEventCreate(&end);

    cudaEventRecord(start);
    sum<<<1,count>>>(d);
    cudaEventRecord(end);
    cudaEventSynchronize(end);
 
    float elapsed;
 
    cudaEventElapsedTime(&elapsed,start,end);
 
    int hSum;

    cudaMemcpyFromSymbol(&hSum , dSum, sizeof(int));
    cout<< "the sum of nums from 1 to "<<count << " is: " << hSum << ", and it took: "<<elapsed<<" ms" <<endl;
 
    cudaFree(d);
    return 0;
 
} 

the sum of nums from 1 to 128 is: 8256, and it took: 0.01648 ms



## Pinned memory performance speed up

In [26]:
%%cu
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <iostream>
using namespace std;

float timeMemory(bool pinned, bool toDevice)
{
    const int count = 1 << 20;
    const int iterations = 1 << 6;
    const int size = count * sizeof(int);
 
    cudaEvent_t start , end;
    int *h,*d;
    float elapsed;
    cudaError_t status;
 
    cudaEventCreate(&start);
    cudaEventCreate(&end);

    cudaMalloc(&d, size);
    if(pinned)
      cudaHostAlloc(&h, size, cudaHostAllocDefault);
    else
      h = new int[count]; //or malloc;
 
    cudaEventRecord(start);

    for (int i = 0 ; i < iterations ; ++i){
        if (toDevice){
            status = cudaMemcpy(d, h, size, cudaMemcpyHostToDevice);
        }
        else{
            status = cudaMemcpy(h, d, size, cudaMemcpyDeviceToHost);
        }
    }

    cudaEventRecord(end);
    cudaEventSynchronize(end);
    cudaEventElapsedTime(&elapsed, start, end);

    if(pinned)
      cudaFreeHost(h);
    else
      delete [] h;
 
    cudaFree(d);
    cudaEventDestroy(start);
    cudaEventDestroy(end);

    return elapsed;

}


int main ()
{
    cout << "From device, paged memory:\t" << timeMemory(false, false) << endl;  
    cout << "To device, paged memory:\t" << timeMemory(false, true) << endl;  
    cout << "From device, pinned memory:\t" << timeMemory(true, false) << endl;  
    cout << "To device, pinned memory:\t" << timeMemory(true, true) << endl;  

    return 0;
 
} 

From device, paged memory:	64.4473
To device, paged memory:	59.5559
From device, pinned memory:	21.3924
To device, pinned memory:	23.2829



## Single stream

In [27]:
# using a single stream and queing up operations on it

%%cu
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
#include <cmath>
#include <ctime>
using namespace std;

//several chunks of elements 
const int chunkCount = 1 << 20;
const int totalCount = chunkCount << 3;


__global__ void kernel(float* a, float* b, float* c)
{
    int tid = blockDim.x*blockIdx.x+threadIdx.x;
    if (tid < chunkCount)
      c[tid] = erff(a[tid]+b[tid]);
}

int main ()
{
      
    cudaEvent_t start , end;
    cudaEventCreate(&start);
    cudaEventCreate(&end);

    cudaStream_t stream;
    cudaStreamCreate(&stream);
 
    float *ha, *hb, *hc, *da, *db, *dc;
    const int totalSize = totalCount * sizeof(float);
    const int chunkSize = chunkCount * sizeof(float);
 
    // allocate memory
    cudaMalloc(&da, chunkSize);
    cudaMalloc(&db, chunkSize);
    cudaMalloc(&dc, chunkSize);
 
    cudaHostAlloc(&ha , totalSize, cudaHostAllocDefault);
    cudaHostAlloc(&hb , totalSize, cudaHostAllocDefault);
    cudaHostAlloc(&hc , totalSize, cudaHostAllocDefault);
 
    // fill a and b with noise on the cpu side
    srand((unsigned)time(0));
    for (int i = 0; i < totalCount ; i++)
    {
        ha[i] = rand() / RAND_MAX;
        hb[i] = rand() / RAND_MAX;
    }

    cudaEventRecord(start, stream);
    //send sequentially chunks of data to the kernel
    for (int i = 0 ; i < totalCount ; i+=chunkCount)
    {
        cudaMemcpyAsync(da, ha+i, chunkSize, cudaMemcpyHostToDevice,stream); //queue data up for the stream to do it
        cudaMemcpyAsync(db, hb+i, chunkSize, cudaMemcpyHostToDevice,stream);
        kernel<<<chunkCount/64,4,0,stream>>>(da,db,dc);
        cudaMemcpyAsync(hc+i, dc, chunkSize, cudaMemcpyDeviceToHost,stream); 
    }
    cudaStreamSynchronize(stream);
 
    cudaEventRecord(end,stream);
    cudaEventSynchronize(end);
    
    float elapsed;  
    cudaEventElapsedTime(&elapsed, start, end);
    cout << "This took: " << elapsed << " msec" <<endl;
    
    cudaFreeHost(ha);
    cudaFreeHost(hb);
    cudaFreeHost(hc);
    cudaFree(da);
    cudaFree(db);
    cudaFree(dc);
    cudaEventDestroy(start);
    cudaEventDestroy(end);
    cudaStreamDestroy(stream);
    return 0;
 
} 

This took: 8.68624 msec



## Multiple Streams

In [28]:
# using a single stream and queing up operations on it

%%cu
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
#include <cmath>
#include <ctime>
using namespace std;

//several chunks of elements 
const int chunkCount = 1 << 20;
const int totalCount = chunkCount << 3;


__global__ void kernel(float* a, float* b, float* c)
{
    int tid = blockDim.x*blockIdx.x+threadIdx.x;
    if (tid < chunkCount)
      c[tid] = erff(a[tid]+b[tid]);
}

int main ()
{
      
    cudaEvent_t start , end;
    cudaEventCreate(&start);
    cudaEventCreate(&end);

    cudaStream_t stream1;
    cudaStream_t stream2;
    cudaStreamCreate(&stream1);
    cudaStreamCreate(&stream2);
 
    float *ha, *hb, *hc, *d1a, *d1b, *d1c, *d2a, *d2b, *d2c;
    const int totalSize = totalCount * sizeof(float);
    const int chunkSize = chunkCount * sizeof(float);
 
    // allocate memory
    cudaMalloc(&d1a, chunkSize);
    cudaMalloc(&d1b, chunkSize);
    cudaMalloc(&d1c, chunkSize);
    cudaMalloc(&d2a, chunkSize);
    cudaMalloc(&d2b, chunkSize);
    cudaMalloc(&d2c, chunkSize);
 
    cudaHostAlloc(&ha , totalSize, cudaHostAllocDefault);
    cudaHostAlloc(&hb , totalSize, cudaHostAllocDefault);
    cudaHostAlloc(&hc , totalSize, cudaHostAllocDefault);
 
    // fill a and b with noise on the cpu side
    srand((unsigned)time(0));
    for (int i = 0; i < totalCount ; i++)
    {
        ha[i] = rand() / RAND_MAX;
        hb[i] = rand() / RAND_MAX;
    }

    cudaEventRecord(start, stream1);
    //send sequentially chunks of data to the kernel
    for (int i = 0 ; i < totalCount ; i+=chunkCount*2)
    {

        cudaMemcpyAsync(d1a, ha+i, chunkSize, cudaMemcpyHostToDevice,stream1); //queue data up for the stream to do it
        cudaMemcpyAsync(d2a, ha+i+chunkCount, chunkSize, cudaMemcpyHostToDevice,stream2); //queue data up for the stream to do it
        cudaMemcpyAsync(d1b, hb+i, chunkSize, cudaMemcpyHostToDevice,stream1);
        cudaMemcpyAsync(d2b, hb+i+chunkCount, chunkSize, cudaMemcpyHostToDevice,stream2);
        kernel<<<chunkCount/64,4,0,stream1>>>(d1a,d1b,d1c);
        kernel<<<chunkCount/64,4,0,stream2>>>(d2a,d2b,d2c);

        cudaMemcpyAsync(hc+i, d1c, chunkSize, cudaMemcpyDeviceToHost,stream1);    
        cudaMemcpyAsync(hc+i+chunkCount, d2c, chunkSize, cudaMemcpyDeviceToHost,stream2);    
      

    }
    cudaStreamSynchronize(stream1);
    cudaStreamSynchronize(stream2);
 
    cudaEventRecord(end,stream2);
    cudaEventSynchronize(end);
    
    float elapsed;  
    cudaEventElapsedTime(&elapsed, start, end);
    cout << "This took: " << elapsed << " msec" <<endl;
    
    cudaFreeHost(ha);
    cudaFreeHost(hb);
    cudaFreeHost(hc);
   
    cudaStreamDestroy(stream1);
    cudaStreamDestroy(stream2);
    return 0;
 
} 

This took: 6.32221 msec



## Thrust

In [29]:
%%file thrust_example.cu
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/sort.h>

#include <ctime>
using namespace std;

int myrand()
{
    return rand() % 10;
}
int main()
{
    int count = 1024;
    thrust::host_vector<int> h(count);
    generate(begin(h),end(h),myrand);
    thrust::device_vector<int> d = h;
    thrust::sort(begin(d), end(d));
    h =d ;
    for(int i = 0; i<count ; i++)
    {
        cout << h[i] <<"\t";
    }
    return 0;
}




Writing thrust_example.cu


In [30]:
!nvcc thrust_example.cu -o thrust_example
!./thrust_example

0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	1	2	2	2	2	2	2	2	2	2	2	2	2	2	2	2	2	2	2	2	2	2	2	2	2	2	2	2	2	2	2	2	2	2	2	2	2	2	2	2	2	2	2	2	2	2	2	2	2	2	2	2	2	2	2	2	2	2	2	2	2	2	2	2	2	2	2	2	2	2	2	2	2	2	2	2	2	2	2	2	2	2	2	2	2	2	2	2	2	2	2	2	2	2	2	2	2	2	2	2	2	2	2	2	2	3	3	3	3	3	3	3	3	3	3	3	3	3	3	3	3	3	3	3	3	3	3	3	3	3	3	3	3	3	3	3	3	3	3	3	3	3	3	3	3	3	3	3	3	3	3	3	3	3	3	3	3	3	3	3	3	3	3	3	3	3	3	3	3	3	3	3	3	3	3	3	3	3	3	3	3	3	3	3	3	3	3	3	3	3	3	3	3	3	3	3	3	3	3	3	3	3	3	3	4	4	4	4	4	4	4	4	4	4	4	4	4	4	4	4	4	4	4	4	4	4	4	4	4	4	4	4	4	4	4	4	4	4	4	4	4	4	4	4	4	4	4	4	4	4	4	4	4	4	4	4	4	4	4	4	4	4	4	4	4	4	4	4	4	4	4	4	4	4	4	4	4	4	4	4	4	4	4	4	4	4	4	4	4	4	4	4	4	4	4	