In [None]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2024 NVIDIA Corporation
Built on Thu_Jun__6_02:18:23_PDT_2024
Cuda compilation tools, release 12.5, V12.5.82
Build cuda_12.5.r12.5/compiler.34385749_0


In [None]:
!nvidia-smi

Sun May  4 10:25:46 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   42C    P8              9W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [None]:
%%writefile vector.cu
#include <bits/stdc++.h>
#include <cuda_runtime.h>
using namespace std;
using namespace std::chrono;

// Element-wise vector addition kernel: C[i] = A[i] + B[i].
// Each thread derives one flat index from its x-dimension block/thread
// coordinates and handles that single element; threads whose index is not
// below `size` return without touching memory.
// A, B, C are dereferenced in device code, so they must be device-accessible.
__global__ void add(int* A,int* B,int* C,int size){
  const int idx=threadIdx.x+blockDim.x*blockIdx.x;
  // Surplus threads in the last block have nothing to do.
  if(idx>=size){
    return;
  }
  C[idx]=A[idx]+B[idx];
}

// Fills the first `size` entries of `vector` with pseudo-random digits in
// [0, 9], drawing one rand() value per entry in index order.
void initialize(int* vector,int size){
  std::generate_n(vector,size,[]{ return rand()%10; });
}

// Writes the first `size` entries of `vector` to stdout on a single line,
// each followed by one space, then ends the line and flushes.
void print(int* vector,int size){
  int idx=0;
  while(idx<size){
    cout<<vector[idx]<<" ";
    ++idx;
  }
  cout<<endl;
}

// Prints a diagnostic and terminates the program when a CUDA runtime call
// (or a kernel launch, via cudaGetLastError) did not return cudaSuccess.
static void checkCuda(cudaError_t status,const char* what){
  if(status!=cudaSuccess){
    cerr<<"CUDA error during "<<what<<" : "<<cudaGetErrorString(status)<<endl;
    exit(EXIT_FAILURE);
  }
}

// Builds two random N-element host vectors, adds them on the GPU with the
// `add` kernel, and prints both operands and the result.
// Returns 0 on success; exits with EXIT_FAILURE if any CUDA call fails.
int main(){
  int N=4;
  int *A,*B,*C;
  int vectorSize=N;
  size_t vectorBytes=vectorSize*sizeof(int);

  // Host-side buffers.
  A=new int[vectorSize];
  B=new int[vectorSize];
  C=new int[vectorSize];
  initialize(A,N);
  initialize(B,N);
  cout<<"Vector A : ";
  print(A,N);
  cout<<"Vector B : ";
  print(B,N);

  // Device-side buffers mirroring A, B and C.
  int *X,*Y,*Z;
  checkCuda(cudaMalloc(&X,vectorBytes),"cudaMalloc X");
  checkCuda(cudaMalloc(&Y,vectorBytes),"cudaMalloc Y");
  checkCuda(cudaMalloc(&Z,vectorBytes),"cudaMalloc Z");

  checkCuda(cudaMemcpy(X,A,vectorBytes,cudaMemcpyHostToDevice),"copy A to device");
  checkCuda(cudaMemcpy(Y,B,vectorBytes,cudaMemcpyHostToDevice),"copy B to device");

  int threadsPerBlock=256;
  // Ceil-divide so a partially filled last block still covers the tail.
  int BlocksPerGrid=(N+threadsPerBlock-1)/threadsPerBlock;
  // Execution configuration is <<<grid, block>>>: block count first,
  // threads per block second.
  add<<<BlocksPerGrid,threadsPerBlock>>>(X,Y,Z,N);
  // A kernel launch returns nothing; a bad configuration is only visible here.
  checkCuda(cudaGetLastError(),"add kernel launch");

  // Blocking copy: also waits for the kernel and reports any execution error.
  checkCuda(cudaMemcpy(C,Z,vectorBytes,cudaMemcpyDeviceToHost),"copy result to host");
  cout<<"Addition : ";
  print(C,N);

  delete[] A;
  delete[] B;
  delete[] C;
  checkCuda(cudaFree(X),"cudaFree X");
  checkCuda(cudaFree(Y),"cudaFree Y");
  checkCuda(cudaFree(Z),"cudaFree Z");
  return 0;
}

Overwriting vector.cu


In [None]:
!nvcc -arch=sm_75 vector.cu -o vec

In [None]:
!./vec

Vector A : 3 6 7 5 
Vector B : 3 5 6 2 
Addition : 6 11 13 7 


In [None]:
%%writefile matrix.cu
#include <bits/stdc++.h>
#include <cuda_runtime.h>
using namespace std;
using namespace std::chrono;

// Square matrix product kernel: C = A * B for size x size matrices stored
// row-major in flat int arrays.
// Each thread derives a (row, column) pair from its y/x block and thread
// coordinates and computes that single output cell as the dot product of a
// row of A with a column of B; threads outside the matrix return immediately.
// A, B, C are dereferenced in device code, so they must be device-accessible.
__global__ void multiply(int *A,int *B,int *C,int size){
  const int r=threadIdx.y+blockDim.y*blockIdx.y;
  const int c=threadIdx.x+blockDim.x*blockIdx.x;
  // Threads that fall past the matrix edge have no cell to compute.
  if(r>=size || c>=size){
    return;
  }
  const int* aRow=A+r*size;
  int acc=0;
  for(int k=0;k<size;++k){
    acc+=aRow[k]*B[k*size+c];
  }
  C[r*size+c]=acc;
}

// Fills a size x size matrix (stored as a flat array of size*size ints) with
// pseudo-random digits in [0, 9], one rand() draw per cell in storage order.
void initialize(int* matrix,int size){
  const int cells=size*size;
  int idx=0;
  while(idx<cells){
    matrix[idx]=rand()%10;
    ++idx;
  }
}

// Writes a size x size row-major matrix to stdout: one text line per row,
// every value followed by a single space, then one trailing blank line.
void print(int* matrix,int size){
  for(int r=0;r<size;++r){
    // Start of row r inside the flat array.
    const int* line=matrix+r*size;
    for(int c=0;c<size;++c){
      cout<<line[c]<<" ";
    }
    cout<<endl;
  }
  cout<<endl;
}

// Prints a diagnostic and terminates the program when a CUDA runtime call
// (or a kernel launch, via cudaGetLastError) did not return cudaSuccess.
static void checkCuda(cudaError_t status,const char* what){
  if(status!=cudaSuccess){
    cerr<<"CUDA error during "<<what<<" : "<<cudaGetErrorString(status)<<endl;
    exit(EXIT_FAILURE);
  }
}

// Builds two random N x N host matrices, multiplies them on the GPU with the
// `multiply` kernel, and prints both operands and the product.
// Returns 0 on success; exits with EXIT_FAILURE if any CUDA call fails.
int main(){
  int *A,*B,*C;
  int N=2;
  // Threads per block along each axis (blockSize x blockSize threads per block).
  int blockSize=16;
  int matrixSize=N*N;
  size_t matrixBytes=matrixSize*sizeof(int);

  // Host-side buffers.
  A=new int[matrixSize];
  B=new int[matrixSize];
  C=new int[matrixSize];

  initialize(A,N);
  initialize(B,N);
  cout<<"Matrix A : ";
  print(A,N);
  cout<<"Matrix B : ";
  print(B,N);

  // Device-side buffers mirroring A, B and C.
  int *X,*Y,*Z;
  checkCuda(cudaMalloc(&X,matrixBytes),"cudaMalloc X");
  checkCuda(cudaMalloc(&Y,matrixBytes),"cudaMalloc Y");
  checkCuda(cudaMalloc(&Z,matrixBytes),"cudaMalloc Z");

  checkCuda(cudaMemcpy(X,A,matrixBytes,cudaMemcpyHostToDevice),"copy A to device");
  checkCuda(cudaMemcpy(Y,B,matrixBytes,cudaMemcpyHostToDevice),"copy B to device");

  // Ceil-divide so the grid covers the whole matrix for every N >= 1; plain
  // N/blockSize truncates and would drop the last rows/columns (or give a
  // zero-sized grid). The kernel's bounds check discards the surplus threads.
  int Blocks=(N+blockSize-1)/blockSize;
  dim3 threads(blockSize,blockSize);
  dim3 blocks(Blocks,Blocks);
  multiply<<<blocks,threads>>>(X,Y,Z,N);
  // A kernel launch returns nothing; a bad configuration is only visible here.
  checkCuda(cudaGetLastError(),"multiply kernel launch");

  // Blocking copy: also waits for the kernel and reports any execution error.
  checkCuda(cudaMemcpy(C,Z,matrixBytes,cudaMemcpyDeviceToHost),"copy result to host");
  cout<<"Multiplication of A and B "<<endl;
  print(C,N);

  delete[] A;
  delete[] B;
  delete[] C;
  checkCuda(cudaFree(X),"cudaFree X");
  checkCuda(cudaFree(Y),"cudaFree Y");
  checkCuda(cudaFree(Z),"cudaFree Z");
  return 0;
}

Overwriting matrix.cu


In [None]:
!nvcc -arch=sm_75 matrix.cu -o mat

    int blockSize=16;
        ^




In [None]:
! ./mat

Matrix A : 3 6 
7 5 

Matrix B : 3 5 
6 2 

Multiplication of A and B 
45 27 
51 45 

