In [6]:
!pip install git+https://github.com/afnan47/cuda.git


Collecting git+https://github.com/afnan47/cuda.git
  Cloning https://github.com/afnan47/cuda.git to /tmp/pip-req-build-auuej2xh
  Running command git clone --filter=blob:none --quiet https://github.com/afnan47/cuda.git /tmp/pip-req-build-auuej2xh
  Resolved https://github.com/afnan47/cuda.git to commit aac710a35f52bb78ab34d2e52517237941399eff
  Preparing metadata (setup.py) ... done


In [8]:
!/usr/local/cuda/bin/nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Tue_Aug_15_22:02:13_PDT_2023
Cuda compilation tools, release 12.2, V12.2.140
Build cuda_12.2.r12.2/compiler.33191640_0


In [9]:
%load_ext nvcc_plugin


created output directory at /content/src
Out bin /content/result.out


In [16]:
%%cu

#include <bits/stdc++.h>
using namespace std;

// Element-wise integer vector addition: Z[i] = X[i] + Y[i] for i in [0, size).
//
// Each thread derives a single flat index from its x-dimension block and
// thread coordinates and handles at most one element. Threads whose index is
// at or beyond `size` exit without touching memory, so the launch may cover
// more threads than there are elements.
//
// X, Y : input arrays, read at indices [0, size)
// Z    : output array, written at indices [0, size)
// size : number of elements to add
__global__ void add(int *X, int *Y, int *Z, int size){
    const int idx = threadIdx.x + blockDim.x * blockIdx.x;

    // Surplus threads (typically in the last block) have nothing to do.
    if(idx >= size){
        return;
    }

    Z[idx] = X[idx] + Y[idx];
}

// Fills the first n entries of arr with pseudo-random values in [0, 9].
// One rand() draw is consumed per entry, in ascending index order; the
// generator is not seeded here.
void initialize(int *arr, int n){
    int *cursor = arr;
    for(int remaining = n; remaining > 0; --remaining){
        *cursor = rand() % 10;
        ++cursor;
    }
}

// Writes the first `size` entries of vec to standard output on one line,
// each followed by a single space, then ends the line (and flushes).
void print(int *vec, int size){
    int pos = 0;
    while(pos < size){
        std::cout << vec[pos] << " ";
        ++pos;
    }
    std::cout << std::endl;
}

// Terminates the program with a diagnostic on stderr when a CUDA runtime call
// reports anything other than cudaSuccess.
//
// status : return code of the CUDA call being checked
// what   : short description of the call, used in the error message
static void checkCuda(cudaError_t status, const char *what){
    if(status != cudaSuccess){
        std::cerr << "CUDA error during " << what << ": "
                  << cudaGetErrorString(status) << std::endl;
        std::exit(EXIT_FAILURE);
    }
}

// Vector-addition demo: fills two host vectors of N random digits, adds them
// on the GPU with the `add` kernel, and prints the inputs and the result.
// Every CUDA runtime call and the kernel launch are checked, so a failure is
// reported instead of printing an unspecified result.
int main(){
    const int N=8;
    const size_t vectorbytes=N * sizeof(int);

    // Host buffers; std::vector releases them on every exit path.
    std::vector<int> A(N), B(N), C(N);

    initialize(A.data(),N);
    initialize(B.data(),N);

    std::cout<<"print vector A: ";
    print(A.data(),N);
    std::cout<<"print vector B: ";
    print(B.data(),N);

    // Device buffers for the two inputs and the output.
    int *X=nullptr, *Y=nullptr, *Z=nullptr;
    checkCuda(cudaMalloc(&X, vectorbytes), "cudaMalloc(X)");
    checkCuda(cudaMalloc(&Y, vectorbytes), "cudaMalloc(Y)");
    checkCuda(cudaMalloc(&Z, vectorbytes), "cudaMalloc(Z)");

    checkCuda(cudaMemcpy(X,A.data(),vectorbytes,cudaMemcpyHostToDevice), "copy A to device");
    checkCuda(cudaMemcpy(Y,B.data(),vectorbytes,cudaMemcpyHostToDevice), "copy B to device");

    // Ceiling division so that the launch covers all N elements even when N
    // is not a multiple of the block size; the kernel ignores surplus threads.
    const int threadsperBlock=256;
    const int blocksperGrid=(N + threadsperBlock-1) / threadsperBlock;

    add<<<blocksperGrid,threadsperBlock>>>(X,Y,Z,N);
    // A launch does not return a status itself: configuration errors are
    // picked up here, execution errors at the synchronisation below.
    checkCuda(cudaGetLastError(), "add kernel launch");
    checkCuda(cudaDeviceSynchronize(), "add kernel execution");

    checkCuda(cudaMemcpy(C.data(),Z,vectorbytes,cudaMemcpyDeviceToHost), "copy result to host");

    std::cout<<"Addition vector is: "<<std::endl;
    print(C.data(),N);

    checkCuda(cudaFree(X), "cudaFree(X)");
    checkCuda(cudaFree(Y), "cudaFree(Y)");
    checkCuda(cudaFree(Z), "cudaFree(Z)");

    return 0;
}

print vector A: 3 6 7 5 3 5 6 2 
print vector B: 9 1 2 7 0 9 3 6 
Addition vector is: 
12 7 9 12 3 14 9 8 



In [23]:
%%cu

#include <bits/stdc++.h>
using namespace std;

// Square integer matrix product Z = X * Y, with all three matrices stored as
// size*size contiguous ints indexed as [row * size + col].
//
// Each thread computes at most one output element: its row comes from the
// y-dimension block/thread coordinates and its column from the x-dimension
// ones. Threads that land outside the size x size matrix exit immediately,
// so the 2D launch may cover more threads than there are elements.
//
// X, Y : input matrices
// Z    : output matrix
// size : number of rows (and columns) of each matrix
__global__ void mul(int *X, int *Y, int *Z, int size){
    const int col = threadIdx.x + blockIdx.x * blockDim.x;
    const int row = threadIdx.y + blockIdx.y * blockDim.y;

    // Threads outside the matrix have no element to produce.
    if(row >= size || col >= size){
        return;
    }

    // Dot product of row `row` of X with column `col` of Y.
    const int *lhsRow = X + row * size;
    int acc = 0;
    for(int k = 0; k < size; ++k){
        acc += lhsRow[k] * Y[k * size + col];
    }

    Z[row * size + col] = acc;
}

// Fills an n x n matrix, stored as n*n contiguous ints, with pseudo-random
// values in [0, 9]. One rand() draw is consumed per entry, in ascending
// index order; the generator is not seeded here.
void initialize(int *arr, int n){
    const int total = n * n;
    int idx = 0;
    while(idx < total){
        arr[idx] = rand() % 10;
        ++idx;
    }
}

// Writes a size x size matrix (stored as size*size contiguous ints, one row
// after another) to standard output: one text line per row, every entry
// followed by a space, and a blank line after the last row.
void print(int *mat, int size){
    for(int r = 0; r < size; ++r){
        const int *rowStart = mat + r * size;
        for(int c = 0; c < size; ++c){
            std::cout << rowStart[c] << " ";
        }
        std::cout << std::endl;
    }
    std::cout << std::endl;
}

// Terminates the program with a diagnostic on stderr when a CUDA runtime call
// reports anything other than cudaSuccess.
//
// status : return code of the CUDA call being checked
// what   : short description of the call, used in the error message
static void checkCuda(cudaError_t status, const char *what){
    if(status != cudaSuccess){
        std::cerr << "CUDA error during " << what << ": "
                  << cudaGetErrorString(status) << std::endl;
        std::exit(EXIT_FAILURE);
    }
}

// Matrix-multiplication demo: fills two N x N host matrices with random
// digits, multiplies them on the GPU with the `mul` kernel, and prints the
// inputs and the product. Every CUDA runtime call and the kernel launch are
// checked, so a failure is reported instead of printing an unspecified result.
int main(){
    const int N=2;
    const int matsize=N*N;
    const int blockSize=16;   // edge of the square thread block (16 x 16 threads)
    const size_t matbytes=matsize * sizeof(int);

    // Host buffers; std::vector releases them on every exit path.
    std::vector<int> A(matsize), B(matsize), C(matsize);

    initialize(A.data(),N);
    initialize(B.data(),N);

    std::cout<<"print matrix A: ";
    print(A.data(),N);
    std::cout<<"print matrix B: ";
    print(B.data(),N);

    // Device buffers for the two inputs and the output.
    int *X=nullptr, *Y=nullptr, *Z=nullptr;
    checkCuda(cudaMalloc(&X, matbytes), "cudaMalloc(X)");
    checkCuda(cudaMalloc(&Y, matbytes), "cudaMalloc(Y)");
    checkCuda(cudaMalloc(&Z, matbytes), "cudaMalloc(Z)");

    checkCuda(cudaMemcpy(X,A.data(),matbytes,cudaMemcpyHostToDevice), "copy A to device");
    checkCuda(cudaMemcpy(Y,B.data(),matbytes,cudaMemcpyHostToDevice), "copy B to device");

    // Ceiling division: the grid must cover all N rows and columns even when
    // N is not a multiple of the block edge. Floor division would leave the
    // trailing rows/columns uncomputed and give a zero-sized (invalid) grid
    // whenever N is smaller than the block edge. The kernel bounds-checks,
    // so the surplus threads are harmless.
    const int blocksPerAxis=(N + blockSize-1) / blockSize;

    dim3 threads(blockSize,blockSize);
    dim3 blocks(blocksPerAxis,blocksPerAxis);

    mul<<<blocks,threads>>>(X,Y,Z,N);
    // A launch does not return a status itself: configuration errors are
    // picked up here, execution errors at the synchronisation below.
    checkCuda(cudaGetLastError(), "mul kernel launch");
    checkCuda(cudaDeviceSynchronize(), "mul kernel execution");

    checkCuda(cudaMemcpy(C.data(),Z,matbytes,cudaMemcpyDeviceToHost), "copy result to host");

    std::cout<<"Multiplication matrix is: "<<std::endl;
    print(C.data(),N);

    checkCuda(cudaFree(X), "cudaFree(X)");
    checkCuda(cudaFree(Y), "cudaFree(Y)");
    checkCuda(cudaFree(Z), "cudaFree(Z)");

    return 0;
}

print matrix A: 3 6 
7 5 

print matrix B: 3 5 
6 2 

Multiplication matrix is: 
45 27 
51 45 


