In [2]:
# Set up CUDA
# First, change the runtime type to GPU, then run this cell.
!pip install git+https://github.com/afnan47/cuda.git
%load_ext nvcc_plugin

Collecting git+https://github.com/afnan47/cuda.git
  Cloning https://github.com/afnan47/cuda.git to /tmp/pip-req-build-4l7omyvg
  Running command git clone --filter=blob:none --quiet https://github.com/afnan47/cuda.git /tmp/pip-req-build-4l7omyvg
  Resolved https://github.com/afnan47/cuda.git to commit aac710a35f52bb78ab34d2e52517237941399eff
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: NVCCPlugin
  Building wheel for NVCCPlugin (setup.py) ... [?25l[?25hdone
  Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-py3-none-any.whl size=4290 sha256=a361185cc11c0d74dafe4788054b6e4f035268da88e1f6a3cbe50cc0f586c10e
  Stored in directory: /tmp/pip-ephem-wheel-cache-tqe8hatt/wheels/bc/4e/e0/2d86bd15f671dbeb32144013f1159dba09757fde36dc51a963
Successfully built NVCCPlugin
Installing collected packages: NVCCPlugin
Successfully installed NVCCPlugin-0.0.2
created output directory at /content/src
Out bin /content/result.out


In [3]:
%%writefile vector_add.cu
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

// Element-wise integer vector addition: C[i] = A[i] + B[i] for i in [0, size).
//
// Indexing uses only the x dimension of the launch (blockIdx.x, blockDim.x,
// threadIdx.x). Each thread handles at most one element, so the launch must
// provide at least `size` threads in total for every element to be written.
// Threads whose index falls at or beyond `size` do nothing.
__global__
void add(int* A, int* B, int* C, int size) {
    const int idx = threadIdx.x + blockDim.x * blockIdx.x;

    // Surplus threads in the last block have no element to process.
    if (idx >= size) {
        return;
    }

    const int lhs = A[idx];
    const int rhs = B[idx];
    C[idx] = lhs + rhs;
}

// Square integer matrix product C = A * B, where all three matrices are
// size x size and stored row-major in flat arrays (element (r, c) lives at
// index r * size + c).
//
// The x dimension of the launch selects the output column and the y dimension
// selects the output row. Each thread computes one output element as the dot
// product of a row of A with a column of B; threads that map outside the
// matrix do nothing.
__global__
void multiply(int* A, int* B, int* C, int size) {
    const int outCol = threadIdx.x + blockDim.x * blockIdx.x;
    const int outRow = threadIdx.y + blockDim.y * blockIdx.y;

    // Threads outside the size x size output have no element to compute.
    if (outRow >= size || outCol >= size) {
        return;
    }

    const int* rowOfA = A + outRow * size;  // start of this thread's row in A
    int acc = 0;
    for (int k = 0; k < size; ++k) {
        acc += rowOfA[k] * B[k * size + outCol];
    }
    C[outRow * size + outCol] = acc;
}

// Fills the first `size` entries of `vector` with pseudo-random digits in
// [0, 9], drawn from rand() in index order. A non-positive `size` writes
// nothing. The sequence depends on the current rand() seed.
void initializeVector(int* vector, int size) {
    int filled = 0;
    while (filled < size) {
        const int digit = rand() % 10;
        vector[filled] = digit;
        ++filled;
    }
}

// Fills a size x size matrix, stored as a flat array of size * size ints,
// with pseudo-random digits in [0, 9] drawn from rand() in index order.
//
// A non-positive `size` is treated as an empty matrix and nothing is written.
// (Squaring a negative dimension would otherwise yield a positive element
// count and write to memory for a dimension that makes no sense.)
void initializeMatrix(int* matrix, int size) {
    if (size <= 0) {
        return;
    }

    // Compute the element count in size_t: size * size in int overflows once
    // size exceeds 46340.
    const size_t total = (size_t)size * (size_t)size;
    for (size_t i = 0; i < total; i++) {
        matrix[i] = rand() % 10;
    }
}

// Writes the first `size` entries of `vector` to stdout on a single line,
// each value followed by one space, then ends the line with a newline.
void printVector(int* vector, int size) {
    const int* cursor = vector;
    int remaining = size;
    while (remaining > 0) {
        printf("%d ", *cursor);
        ++cursor;
        --remaining;
    }
    printf("\n");
}

// Writes a size x size matrix (flat, row-major: element (r, c) is at index
// r * size + c) to stdout, one row per line with each value followed by a
// space, then prints one extra blank line after the last row.
void printMatrix(int* matrix, int size) {
    int r = 0;
    while (r < size) {
        const int* rowStart = matrix + r * size;
        for (int c = 0; c < size; ++c) {
            printf("%d ", rowStart[c]);
        }
        printf("\n");
        ++r;
    }
    printf("\n");
}

// Terminates the program with a diagnostic on stderr when a CUDA runtime call
// reports anything other than cudaSuccess. `what` names the operation that
// failed and appears in the message.
static void checkCuda(cudaError_t status, const char* what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Allocates `bytes` of host memory for ints; terminates the program with a
// diagnostic instead of returning NULL when the allocation fails.
static int* allocHostInts(size_t bytes) {
    int* ptr = (int*)malloc(bytes);
    if (ptr == NULL) {
        fprintf(stderr, "Host allocation of %zu bytes failed\n", bytes);
        exit(EXIT_FAILURE);
    }
    return ptr;
}

// Demo driver: adds two random N-element vectors and multiplies two random
// N x N matrices on the GPU, printing inputs and results to stdout.
//
// Every CUDA runtime call and kernel launch is checked; on the first failure
// the program prints the error to stderr and exits with a non-zero status
// rather than printing results computed from uninitialised memory.
//
// Returns 0 when both demos complete successfully.
int main() {
    const int N = 4;

    // -------------------- Vector Addition --------------------
    const size_t vectorBytes = (size_t)N * sizeof(int);

    int* A = allocHostInts(vectorBytes);
    int* B = allocHostInts(vectorBytes);
    int* C = allocHostInts(vectorBytes);

    initializeVector(A, N);
    initializeVector(B, N);

    printf("Vector A: ");
    printVector(A, N);
    printf("Vector B: ");
    printVector(B, N);

    int *d_A = NULL, *d_B = NULL, *d_C = NULL;
    checkCuda(cudaMalloc(&d_A, vectorBytes), "cudaMalloc d_A");
    checkCuda(cudaMalloc(&d_B, vectorBytes), "cudaMalloc d_B");
    checkCuda(cudaMalloc(&d_C, vectorBytes), "cudaMalloc d_C");

    checkCuda(cudaMemcpy(d_A, A, vectorBytes, cudaMemcpyHostToDevice), "cudaMemcpy A -> d_A");
    checkCuda(cudaMemcpy(d_B, B, vectorBytes, cudaMemcpyHostToDevice), "cudaMemcpy B -> d_B");

    const int threadsPerBlock = 256;
    // Ceiling division so a partial final block still covers the tail.
    const int blocks = (N + threadsPerBlock - 1) / threadsPerBlock;

    add<<<blocks, threadsPerBlock>>>(d_A, d_B, d_C, N);
    // Launch-configuration errors surface through cudaGetLastError();
    // faults during execution surface at the following synchronisation.
    checkCuda(cudaGetLastError(), "add");
    checkCuda(cudaDeviceSynchronize(), "add");

    checkCuda(cudaMemcpy(C, d_C, vectorBytes, cudaMemcpyDeviceToHost), "cudaMemcpy d_C -> C");

    printf("Addition: ");
    printVector(C, N);

    free(A); free(B); free(C);
    checkCuda(cudaFree(d_A), "cudaFree d_A");
    checkCuda(cudaFree(d_B), "cudaFree d_B");
    checkCuda(cudaFree(d_C), "cudaFree d_C");

    // -------------------- Matrix Multiplication --------------------
    // Widen before multiplying so N * N is not evaluated in int.
    const size_t matrixBytes = (size_t)N * (size_t)N * sizeof(int);

    int* D = allocHostInts(matrixBytes);
    int* E = allocHostInts(matrixBytes);
    int* F = allocHostInts(matrixBytes);

    initializeMatrix(D, N);
    initializeMatrix(E, N);

    printf("\nMatrix D: \n");
    printMatrix(D, N);
    printf("Matrix E: \n");
    printMatrix(E, N);

    int *d_D = NULL, *d_E = NULL, *d_F = NULL;
    checkCuda(cudaMalloc(&d_D, matrixBytes), "cudaMalloc d_D");
    checkCuda(cudaMalloc(&d_E, matrixBytes), "cudaMalloc d_E");
    checkCuda(cudaMalloc(&d_F, matrixBytes), "cudaMalloc d_F");

    checkCuda(cudaMemcpy(d_D, D, matrixBytes, cudaMemcpyHostToDevice), "cudaMemcpy D -> d_D");
    checkCuda(cudaMemcpy(d_E, E, matrixBytes, cudaMemcpyHostToDevice), "cudaMemcpy E -> d_E");

    // With N = 4, 2x2 thread blocks give a 2x2 grid. For real workloads use a
    // block size that is a multiple of the warp size (e.g. 16x16).
    dim3 threads(2, 2);
    dim3 blocksMat((N + threads.x - 1) / threads.x, (N + threads.y - 1) / threads.y);

    multiply<<<blocksMat, threads>>>(d_D, d_E, d_F, N);
    checkCuda(cudaGetLastError(), "multiply");
    checkCuda(cudaDeviceSynchronize(), "multiply");

    checkCuda(cudaMemcpy(F, d_F, matrixBytes, cudaMemcpyDeviceToHost), "cudaMemcpy d_F -> F");

    printf("Multiplication: \n");
    printMatrix(F, N);

    free(D); free(E); free(F);
    checkCuda(cudaFree(d_D), "cudaFree d_D");
    checkCuda(cudaFree(d_E), "cudaFree d_E");
    checkCuda(cudaFree(d_F), "cudaFree d_F");

    return 0;
}


Writing vector_add.cu


In [4]:
!nvcc -arch=sm_70 vector_add.cu -o vector_add


In [5]:
!./vector_add


Vector A: 3 6 7 5 
Vector B: 3 5 6 2 
Addition: 6 11 13 7 

Matrix D: 
9 1 2 7 
0 9 3 6 
0 6 2 6 
1 8 7 9 

Matrix E: 
2 0 2 3 
7 5 9 2 
2 8 9 7 
3 6 1 2 

Multiplication: 
50 63 52 57 
87 105 114 51 
64 82 78 38 
99 150 146 86 



In [1]:
!nvidia-smi


Sat May  3 19:24:19 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   57C    P8             10W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                