In [None]:
pip install nvcc4jupyter


Collecting nvcc4jupyter
  Downloading nvcc4jupyter-1.2.1-py3-none-any.whl (10 kB)
Installing collected packages: nvcc4jupyter
Successfully installed nvcc4jupyter-1.2.1


In [None]:
%load_ext nvcc4jupyter

Detected platform "Colab". Running its setup...
Source files will be saved in "/tmp/tmpmoii5iv7".


In [None]:
%%cuda
#include <stdio.h>
#include <cuda.h>

__global__ void hello(){
    printf("Hello from block: %u, thread: %u\n", blockIdx.x, threadIdx.x);
}

int main(){
    hello<<<2, 3>>>();
    cudaDeviceSynchronize();
}

Hello from block: 0, thread: 0
Hello from block: 0, thread: 1
Hello from block: 0, thread: 2
Hello from block: 1, thread: 0
Hello from block: 1, thread: 1
Hello from block: 1, thread: 2



In [None]:
%%cuda
#include <stdio.h>

const int DSIZE = 4096;
const int block_size = 256;

__global__ void vadd(const float *A, const float *B, float *C, int ds){
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx < ds) {
        C[idx] = A[idx] + B[idx];
    }
}

int main(){
    float *h_A, *h_B, *h_C, *d_A, *d_B, *d_C;

    h_A = new float[DSIZE];
    h_B = new float[DSIZE];
    h_C = new float[DSIZE];

    for (int i = 0; i < DSIZE; i++) {
        h_A[i] = rand()/(float)RAND_MAX;
        h_B[i] = rand()/(float)RAND_MAX;
    }

    cudaMalloc(&d_A, DSIZE*sizeof(float));
    cudaMalloc(&d_B, DSIZE*sizeof(float));
    cudaMalloc(&d_C, DSIZE*sizeof(float));

    cudaMemcpy(d_A, h_A, DSIZE*sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_B, DSIZE*sizeof(float), cudaMemcpyHostToDevice);

    vadd<<<(DSIZE+block_size-1)/block_size, block_size>>>(d_A, d_B, d_C, DSIZE);

    cudaDeviceSynchronize();

    cudaMemcpy(h_C, d_C, DSIZE*sizeof(float), cudaMemcpyDeviceToHost);

    printf("A[0] = %f\n", h_A[0]);
    printf("B[0] = %f\n", h_B[0]);
    printf("C[0] = %f\n", h_C[0]);
    for(int i = 0 ; i < 10 ; i++){
        printf("%f " , h_C[i]);
    }

    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);w
    return 0;
}

A[0] = 0.840188
B[0] = 0.394383
C[0] = 1.234571
1.234571 1.581539 1.109199 1.103452 0.831745 1.106268 0.878185 1.868425 1.353009 0.748571 


In [None]:
%%cuda
#include<stdio.h>
#include<cuda.h>

__global__ void matadd(int *l,int *m, int *n)
{
    int x=blockIdx.x;
    int y=blockIdx.y;
    int id=gridDim.x * y +x;
    n[id]=l[id]+m[id];
}
int main()
{
    int n = 2 , m = 3;
    int a[n][m];
    int b[n][m];
    int c[n][m];
    int *d,*e,*f;
    int i,j;

    for(i=0;i<n;i++)
    {
        for(j=0;j<m;j++)
            {
                a[i][j] = (rand())%11;
            }
    }
    for(i=0;i<n;i++)
    {
        for(j=0;j<m;j++)
            {
                b[i][j] = (rand())%11;
            }
    }
    cudaMalloc((void **)&d,n*m*sizeof(int));
    cudaMalloc((void **)&e,n*m*sizeof(int));
    cudaMalloc((void **)&f,n*m*sizeof(int));
    cudaMemcpy(d,a,n*m*sizeof(int),cudaMemcpyHostToDevice);
    cudaMemcpy(e,b,n*m*sizeof(int),cudaMemcpyHostToDevice);

    dim3 grid(m,n);
    /* Here we are defining two dimensional Grid(collection of blocks) structure. Syntax is dim3 grid(no. of columns,no. of rows) */

    matadd<<<grid,1>>>(d,e,f);

    cudaMemcpy(c,f,n*m*sizeof(int),cudaMemcpyDeviceToHost);
    printf("Matrix A is :\n");
    for(i=0;i<n;i++)
    {
        for(j=0;j<m;j++)
        {
              printf("%d\t",a[i][j]);
        }
        printf("\n");
    }

    printf("Matrix B is :\n");
    for(i=0;i<n;i++)
    {
        for(j=0;j<m;j++)
        {
              printf("%d\t",b[i][j]);
        }
        printf("\n");
    }
    printf("\nSum of two matrices:\n ");
    for(i=0;i<n;i++)
    {
        for(j=0;j<m;j++)
        {
              printf("%d\t",c[i][j]);
        }
        printf("\n");
    }
    cudaFree(d);
    cudaFree(e);
    cudaFree(f);
    return 0;
}

Matrix A is :
6	10	6	
2	1	4	
Matrix B is :
0	6	3	
1	8	7	

Sum of two matrices:
 1	0	0	
0	0	0	



In [None]:
%%cuda
#include <stdio.h>
#include <stdlib.h>


__global__ void matrixMul(double *a, double *b, double *c, int width) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    if (row < width && col < width) {

            c[row * width + col]= a[row * width + col]*b[row * width + col];
        }

    }


int main(int argc, char* argv[]) {
    int width = 800;

    double *h_a, *h_b;
    double *h_c;

    double *d_a ,*d_b;
    double *d_c;

    size_t bytes = width * width * sizeof(double);

    h_a = (double*)malloc(bytes);
    h_b = (double*)malloc(bytes);
    h_c = (double*)malloc(bytes);

    cudaMalloc(&d_a, bytes);
    cudaMalloc(&d_b, bytes);
    cudaMalloc(&d_c, bytes);

    for (int i = 0; i < width; ++i) {
        for (int j = 0; j < width; ++j) {
            h_a[i * width + j] = rand()%11;
            h_b[i * width + j] = rand()%11;
        }
    }

    cudaMemcpy(d_a, h_a, bytes, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, h_b, bytes, cudaMemcpyHostToDevice);

    dim3 blockSize(16, 16);
    dim3 gridSize((width + blockSize.x - 1) / blockSize.x, (width + blockSize.y - 1) / blockSize.y);

    matrixMul<<<gridSize, blockSize>>>(d_a, d_b, d_c, width);

    cudaMemcpy(h_c, d_c, bytes, cudaMemcpyDeviceToHost);

    for (int i = 0; i < width; ++i) {
        for (int j = 0; j < width; ++j) {
            printf("%f ", h_c[i * width + j]);
        }
        printf("\n");
    }

    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);

    free(h_a);
    free(h_b);
    free(h_c);

    // Pause the system to view the output (cross-platform)
    #ifdef _WIN32
    system("pause");
    #else
    getchar();
    #endif

    return 0;
}

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



5.000000 10.000000 48.000000 5.000000 0.000000 27.000000 3.000000 1.000000 14.000000 9.000000 0.000000 27.000000 0.000000 45.000000 6.000000 42.000000 28.000000 0.000000 6.000000 0.000000 21.000000 42.000000 64.000000 70.000000 28.000000 28.000000 24.000000 25.000000 0.000000 2.000000 48.000000 0.000000 0.000000 1.000000 24.000000 9.000000 0.000000 24.000000 30.000000 42.000000 0.000000 20.000000 12.000000 20.000000 12.000000 42.000000 8.000000 15.000000 18.000000 14.000000 30.000000 54.000000 63.000000 40.000000 40.000000 63.000000 32.000000 81.000000 24.000000 27.000000 70.000000 0.000000 70.000000 4.000000 8.000000 0.000000 18.000000 18.000000 20.000000 6.000000 0.000000 60.000000 0.000000 40.000000 56.000000 16.000000 49.000000 12.000000 6.000000 42.000000 28.000000 0.000000 0.000000 4.000000 8.000000 0.000000 0.000000 0.000000 7.000000 28.000000 0.000000 18.000000 40.000000 63.000000 0.000000 10.000000 18.000000 24.000000 30.000000 48.000000 8.000000 30.000000 20.000000 35.000000 

In [None]:
%%cuda
#include <stdio.h>

#define N 800 // Matrix size
#define TILE_WIDTH 10

__global__
void MatrixMulKernel(float* A, float* B, float* C, int width) {
    __shared__ float ds_A[TILE_WIDTH][TILE_WIDTH];
    __shared__ float ds_B[TILE_WIDTH][TILE_WIDTH];

    int bx = blockIdx.x;
    int by = blockIdx.y;
    int tx = threadIdx.x;
    int ty = threadIdx.y;

    int Row = by * TILE_WIDTH + ty;
    int Col = bx * TILE_WIDTH + tx;
    float Cvalue = 0;

    for (int t = 0; t < width / TILE_WIDTH; ++t) {
        ds_A[ty][tx] = A[Row * width + t * TILE_WIDTH + tx];
        ds_B[ty][tx] = B[(t * TILE_WIDTH + ty) * width + Col];
        __syncthreads();

        for (int i = 0; i < TILE_WIDTH; ++i) {
            Cvalue += ds_A[ty][i] * ds_B[i][tx];
        }
        __syncthreads();
    }
    C[Row * width + Col] = Cvalue;
}

int main() {
    float *h_A, *h_B, *h_C;
    float *d_A, *d_B, *d_C;

    // Allocate memory on host
    h_A = (float*)malloc(N * N * sizeof(float));
    h_B = (float*)malloc(N * N * sizeof(float));
    h_C = (float*)malloc(N * N * sizeof(float));

    // Initialize data on host
     for (int i = 0; i < N * N; ++i) {
        h_A[i] = rand()%11;
        h_B[i] = rand()%11;
    }

    // Allocate memory on device
    cudaMalloc(&d_A, N * N * sizeof(float));
    cudaMalloc(&d_B, N * N * sizeof(float));
    cudaMalloc(&d_C, N * N * sizeof(float));

    // Transfer data from host to device
    cudaMemcpy(d_A, h_A, N * N * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_B, N * N * sizeof(float), cudaMemcpyHostToDevice);

    // Define grid and block dimensions
    dim3 dimGrid(N / TILE_WIDTH, N / TILE_WIDTH);
    dim3 dimBlock(TILE_WIDTH, TILE_WIDTH);

    // Launch kernel
    MatrixMulKernel<<<dimGrid, dimBlock>>>(d_A, d_B, d_C, N);

    // Transfer data from device to host
    cudaMemcpy(h_C, d_C, N * N * sizeof(float), cudaMemcpyDeviceToHost);

    // Print the output if needed
    for (int i = 0; i < N; i++) {
    for (int j = 0; j < N; j++) {
      printf("%f ", h_C[i * N + j]);
    }
    printf("\n");
  }


    // Free memory on device
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);

    // Free memory on host
    free(h_A);
    free(h_B);
    free(h_C);

    return 0;
}

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



20073.000000 19443.000000 20391.000000 19939.000000 19925.000000 20300.000000 19217.000000 20219.000000 19131.000000 19601.000000 19668.000000 20310.000000 18881.000000 19312.000000 19919.000000 19953.000000 19484.000000 20559.000000 19895.000000 19734.000000 19109.000000 19933.000000 19424.000000 19415.000000 19635.000000 18916.000000 19361.000000 19590.000000 20149.000000 19268.000000 20003.000000 20721.000000 19392.000000 19308.000000 19253.000000 20425.000000 19188.000000 20130.000000 20204.000000 19322.000000 20184.000000 18926.000000 20191.000000 18330.000000 19923.000000 19177.000000 19271.000000 20383.000000 18511.000000 19538.000000 19116.000000 19515.000000 20063.000000 20211.000000 20040.000000 19528.000000 19452.000000 18765.000000 19739.000000 20073.000000 18896.000000 19732.000000 20084.000000 19531.000000 19417.000000 18793.000000 19432.000000 20223.000000 18887.000000 19783.000000 19931.000000 19464.000000 20290.000000 19582.000000 19630.000000 18639.000000 20285.000000