In [None]:
!nvidia-smi

Mon Nov  3 19:49:03 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   36C    P8              9W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [None]:
%%writefile hello.cu
#include <stdio.h>

__global__ void hello() {
    printf("Hello from GPU thread %d\n", threadIdx.x);
}

int main() {
    hello<<<1,5>>>();
    cudaDeviceSynchronize();
    printf("Hello from CPU\n");
    return 0;
}


Writing hello.cu


In [None]:
!nvcc -arch=sm_70 hello.cu -o hello
!./hello


Hello from GPU thread 0
Hello from GPU thread 1
Hello from GPU thread 2
Hello from GPU thread 3
Hello from GPU thread 4
Hello from CPU


In [None]:
%%writefile blockIdx.cu
#include<stdio.h>

__global__ void block(){
  int t = threadIdx.x;
  int b = blockIdx.x;
  printf("Thread %d in block %d\n", t, b);
}

int main(){
  block<<<3,5>>>();
  cudaDeviceSynchronize();
  return 0;
}


Overwriting blockIdx.cu


In [None]:
!nvcc -arch=sm_70 blockIdx.cu

In [None]:
!./a.out

Thread 0 in block 1
Thread 1 in block 1
Thread 2 in block 1
Thread 3 in block 1
Thread 4 in block 1
Thread 0 in block 2
Thread 1 in block 2
Thread 2 in block 2
Thread 3 in block 2
Thread 4 in block 2
Thread 0 in block 0
Thread 1 in block 0
Thread 2 in block 0
Thread 3 in block 0
Thread 4 in block 0


In [25]:
%%writefile d.cu
#include <stdio.h>
#include <cuda_runtime.h>

__global__ void d() {
    int x = threadIdx.x;
    int y = threadIdx.y;
    printf("Thread (%d,%d) in Block (%d,%d)\n", x, y, blockIdx.x, blockIdx.y);
}

int main() {
    dim3 grid(2, 2);      // 2x2 blocks
    dim3 block(2, 2);     // 2x2 threads per block

    d<<<grid, block>>>(); // Correct launch syntax

    cudaDeviceSynchronize();
    return 0;
}


Overwriting d.cu


In [26]:
!nvcc -arch=sm_70 d.cu

In [27]:
!./a.out

Thread (0,0) in Block (1,0)
Thread (1,0) in Block (1,0)
Thread (0,1) in Block (1,0)
Thread (1,1) in Block (1,0)
Thread (0,0) in Block (1,1)
Thread (1,0) in Block (1,1)
Thread (0,1) in Block (1,1)
Thread (1,1) in Block (1,1)
Thread (0,0) in Block (0,1)
Thread (1,0) in Block (0,1)
Thread (0,1) in Block (0,1)
Thread (1,1) in Block (0,1)
Thread (0,0) in Block (0,0)
Thread (1,0) in Block (0,0)
Thread (0,1) in Block (0,0)
Thread (1,1) in Block (0,0)


In [28]:
%%writefile vecAdd.cu
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <time.h>

// CUDA kernel for vector addition
__global__ void addVectors(float *A, float *B, float *C, int N) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < N)
        C[i] = A[i] + B[i];
}

// CPU version
void addVectorsCPU(float *A, float *B, float *C, int N) {
    for (int i = 0; i < N; i++)
        C[i] = A[i] + B[i];
}

int main() {
    int sizes[] = {100000, 1000000, 10000000}; // 10^5, 10^6, 10^7
    int testCount = sizeof(sizes) / sizeof(sizes[0]);

    for (int t = 0; t < testCount; t++) {
        int N = sizes[t];
        size_t bytes = N * sizeof(float);

        printf("\n=================================\n");
        printf("Vector Size: %d\n", N);

        // Allocate memory on host
        float *h_A = (float*)malloc(bytes);
        float *h_B = (float*)malloc(bytes);
        float *h_C_cpu = (float*)malloc(bytes);
        float *h_C_gpu = (float*)malloc(bytes);

        // Initialize input vectors with random numbers
        for (int i = 0; i < N; i++) {
            h_A[i] = rand() % 100;
            h_B[i] = rand() % 100;
        }

        // ===================== CPU computation =====================
        clock_t start_cpu = clock();
        addVectorsCPU(h_A, h_B, h_C_cpu, N);
        clock_t end_cpu = clock();
        double cpu_time = (double)(end_cpu - start_cpu) / CLOCKS_PER_SEC;

        // ===================== GPU computation =====================
        float *d_A, *d_B, *d_C;
        cudaMalloc((void**)&d_A, bytes);
        cudaMalloc((void**)&d_B, bytes);
        cudaMalloc((void**)&d_C, bytes);

        cudaMemcpy(d_A, h_A, bytes, cudaMemcpyHostToDevice);
        cudaMemcpy(d_B, h_B, bytes, cudaMemcpyHostToDevice);

        int threads = 256;
        int blocks = (N + threads - 1) / threads;

        cudaEvent_t start, stop;
        cudaEventCreate(&start);
        cudaEventCreate(&stop);
        cudaEventRecord(start);

        addVectors<<<blocks, threads>>>(d_A, d_B, d_C, N);

        cudaEventRecord(stop);
        cudaEventSynchronize(stop);
        float gpu_time_ms = 0;
        cudaEventElapsedTime(&gpu_time_ms, start, stop); // in milliseconds

        cudaMemcpy(h_C_gpu, d_C, bytes, cudaMemcpyDeviceToHost);

        // ===================== Verify result =====================
        int correct = 1;
        for (int i = 0; i < N; i++) {
            if (fabs(h_C_cpu[i] - h_C_gpu[i]) > 1e-5) {
                correct = 0;
                break;
            }
        }

        // ===================== Print results =====================
        printf("CPU Time: %.6f s\n", cpu_time);
        printf("GPU Time: %.6f s\n", gpu_time_ms / 1000.0);
        printf("Speedup: %.2fx\n", cpu_time / (gpu_time_ms / 1000.0));
        printf("Result Verification: %s\n", correct ? "PASSED ✅" : "FAILED ❌");

        // ===================== Free memory =====================
        cudaFree(d_A);
        cudaFree(d_B);
        cudaFree(d_C);
        free(h_A);
        free(h_B);
        free(h_C_cpu);
        free(h_C_gpu);
    }

    return 0;
}


Writing vecAdd.cu


In [29]:
!nvcc -arch=sm_70 vecAdd.cu

In [31]:
!./a.out


Vector Size: 100000
CPU Time: 0.000447 s
GPU Time: 0.000106 s
Speedup: 4.22x
Result Verification: PASSED ✅

Vector Size: 1000000
CPU Time: 0.004563 s
GPU Time: 0.000052 s
Speedup: 87.32x
Result Verification: PASSED ✅

Vector Size: 10000000
CPU Time: 0.053648 s
GPU Time: 0.000463 s
Speedup: 115.80x
Result Verification: PASSED ✅


In [34]:
%%writefile matAdd.cu
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <time.h>

#define M 1000   // number of rows
#define N 1000   // number of columns

__global__ void matrixAdd(float *A, float *B, float *C, int rows, int cols) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    if (row < rows && col < cols) {
        int idx = row * cols + col;
        C[idx] = A[idx] + B[idx];
    }
}

// CPU version of matrix addition
void matrixAddCPU(float *A, float *B, float *C, int rows, int cols) {
    for (int i = 0; i < rows * cols; i++) {
        C[i] = A[i] + B[i];
    }
}

int main() {
    int size = M * N * sizeof(float);

    float *A, *B, *C, *d_A, *d_B, *d_C;

    // Allocate memory on host (CPU)
    A = (float *)malloc(size);
    B = (float *)malloc(size);
    C = (float *)malloc(size);

    // Initialize matrices A and B with random numbers
    for (int i = 0; i < M * N; i++) {
        A[i] = rand() % 100;
        B[i] = rand() % 100;
    }

    // Allocate memory on device (GPU)
    cudaMalloc((void **)&d_A, size);
    cudaMalloc((void **)&d_B, size);
    cudaMalloc((void **)&d_C, size);

    // Copy A and B to device
    cudaMemcpy(d_A, A, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, B, size, cudaMemcpyHostToDevice);

    // Define block and grid size
    dim3 blockSize(16, 16);
    dim3 gridSize((N + blockSize.x - 1) / blockSize.x,
                  (M + blockSize.y - 1) / blockSize.y);

    // Measure GPU time
    clock_t start_gpu = clock();
    matrixAdd<<<gridSize, blockSize>>>(d_A, d_B, d_C, M, N);
    cudaDeviceSynchronize();
    clock_t end_gpu = clock();

    // Copy result back to host
    cudaMemcpy(C, d_C, size, cudaMemcpyDeviceToHost);

    // Measure CPU time
    clock_t start_cpu = clock();
    matrixAddCPU(A, B, C, M, N);
    clock_t end_cpu = clock();

    // Compute execution times
    double cpu_time = (double)(end_cpu - start_cpu) / CLOCKS_PER_SEC;
    double gpu_time = (double)(end_gpu - start_gpu) / CLOCKS_PER_SEC;
    double speedup = cpu_time / gpu_time;

    printf("Matrix size: %d x %d\n", M, N);
    printf("CPU Time: %.6f seconds\n", cpu_time);
    printf("GPU Time: %.6f seconds\n", gpu_time);
    printf("Speedup: %.2fx\n", speedup);

    // Free memory
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
    free(A);
    free(B);
    free(C);

    return 0;
}


Writing matAdd.cu


In [35]:
!nvcc -arch=sm_70 matAdd.cu

In [36]:
!./a.out

Matrix size: 1000 x 1000
CPU Time: 0.003201 seconds
GPU Time: 0.000193 seconds
Speedup: 16.59x


In [37]:
%%writefile DotProd.cu
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <time.h>

#define N (2048 * 2048)          // Total number of elements
#define THREADS_PER_BLOCK 512    // Threads per block

// CUDA kernel for parallel dot product
__global__ void dot(int *a, int *b, int *c) {
    __shared__ int temp[THREADS_PER_BLOCK];
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    int tid = threadIdx.x;

    // Each thread computes its partial product
    temp[tid] = a[index] * b[index];
    __syncthreads();

    // Only thread 0 in each block sums up and updates global result
    if (tid == 0) {
        int sum = 0;
        for (int i = 0; i < THREADS_PER_BLOCK; i++)
            sum += temp[i];
        atomicAdd(c, sum);
    }
}

// Function to fill array with random integers
void random_ints(int *x, int n) {
    for (int i = 0; i < n; i++)
        x[i] = rand() % 10;
}

int main(void) {
    int *a, *b, *c;             // Host copies
    int *dev_a, *dev_b, *dev_c; // Device copies
    int size = N * sizeof(int);
    float cpu_time, gpu_time;

    // Allocate memory on device
    cudaMalloc((void**)&dev_a, size);
    cudaMalloc((void**)&dev_b, size);
    cudaMalloc((void**)&dev_c, sizeof(int));

    // Allocate memory on host
    a = (int*)malloc(size);
    b = (int*)malloc(size);
    c = (int*)malloc(sizeof(int));

    // Initialize input vectors
    random_ints(a, N);
    random_ints(b, N);
    *c = 0;

    /* ---------------- CPU Implementation ---------------- */
    clock_t start_cpu = clock();
    long long cpu_result = 0;
    for (int i = 0; i < N; i++)
        cpu_result += (long long)a[i] * b[i];
    clock_t end_cpu = clock();
    cpu_time = ((float)(end_cpu - start_cpu)) / CLOCKS_PER_SEC * 1000.0; // in ms

    /* ---------------- GPU Implementation ---------------- */
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    cudaMemcpy(dev_a, a, size, cudaMemcpyHostToDevice);
    cudaMemcpy(dev_b, b, size, cudaMemcpyHostToDevice);
    cudaMemcpy(dev_c, c, sizeof(int), cudaMemcpyHostToDevice);

    cudaEventRecord(start);
    dot<<<N / THREADS_PER_BLOCK, THREADS_PER_BLOCK>>>(dev_a, dev_b, dev_c);
    cudaEventRecord(stop);

    cudaMemcpy(c, dev_c, sizeof(int), cudaMemcpyDeviceToHost);
    cudaEventSynchronize(stop);

    cudaEventElapsedTime(&gpu_time, start, stop);

    /* ---------------- Results ---------------- */
    printf("CPU Dot Product  = %lld\n", cpu_result);
    printf("GPU Dot Product  = %d\n", *c);
    printf("CPU Time = %.3f ms\n", cpu_time);
    printf("GPU Time = %.3f ms\n", gpu_time);
    printf("Speedup  = %.2fx\n", cpu_time / gpu_time);

    /* ---------------- Cleanup ---------------- */
    free(a); free(b); free(c);
    cudaFree(dev_a); cudaFree(dev_b); cudaFree(dev_c);
    cudaEventDestroy(start); cudaEventDestroy(stop);

    return 0;
}


Writing DotProd.cu


In [38]:
!nvcc -arcsm_70 DotProd.cu

nvcc fatal   : Unknown option '-arcsm_70'


In [39]:
!./a.out

Matrix size: 1000 x 1000
CPU Time: 0.003163 seconds
GPU Time: 0.000172 seconds
Speedup: 18.39x


In [40]:
%%writefile MatMat.cu
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <time.h>

#define M 512   // Rows in A and C
#define N 512   // Columns in A, Rows in B
#define P 512   // Columns in B and C

// CUDA kernel for Matrix Multiplication
__global__ void matrixMulKernel(float *A, float *B, float *C, int M, int N, int P) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    float sum = 0.0;

    if (row < M && col < P) {
        for (int k = 0; k < N; ++k)
            sum += A[row * N + k] * B[k * P + col];
        C[row * P + col] = sum;
    }
}

// Function to fill matrix with random floats
void random_matrix(float *mat, int rows, int cols) {
    for (int i = 0; i < rows * cols; i++)
        mat[i] = (float)(rand() % 10);
}

int main() {
    int size_A = M * N * sizeof(float);
    int size_B = N * P * sizeof(float);
    int size_C = M * P * sizeof(float);

    float *A, *B, *C_cpu, *C_gpu;
    float *d_A, *d_B, *d_C;

    A = (float*)malloc(size_A);
    B = (float*)malloc(size_B);
    C_cpu = (float*)malloc(size_C);
    C_gpu = (float*)malloc(size_C);

    random_matrix(A, M, N);
    random_matrix(B, N, P);

    /* ---------------- CPU Implementation ---------------- */
    clock_t start_cpu = clock();
    for (int i = 0; i < M; i++) {
        for (int j = 0; j < P; j++) {
            float sum = 0.0;
            for (int k = 0; k < N; k++)
                sum += A[i * N + k] * B[k * P + j];
            C_cpu[i * P + j] = sum;
        }
    }
    clock_t end_cpu = clock();
    float cpu_time = ((float)(end_cpu - start_cpu)) / CLOCKS_PER_SEC * 1000.0; // in ms

    /* ---------------- GPU Implementation ---------------- */
    cudaMalloc((void**)&d_A, size_A);
    cudaMalloc((void**)&d_B, size_B);
    cudaMalloc((void**)&d_C, size_C);

    cudaMemcpy(d_A, A, size_A, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, B, size_B, cudaMemcpyHostToDevice);

    dim3 threadsPerBlock(16, 16);
    dim3 numBlocks((P + threadsPerBlock.x - 1) / threadsPerBlock.x,
                   (M + threadsPerBlock.y - 1) / threadsPerBlock.y);

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    cudaEventRecord(start);
    matrixMulKernel<<<numBlocks, threadsPerBlock>>>(d_A, d_B, d_C, M, N, P);
    cudaEventRecord(stop);

    cudaMemcpy(C_gpu, d_C, size_C, cudaMemcpyDeviceToHost);
    cudaEventSynchronize(stop);

    float gpu_time;
    cudaEventElapsedTime(&gpu_time, start, stop);

    /* ---------------- Speedup and Output ---------------- */
    printf("CPU Time = %.3f ms\n", cpu_time);
    printf("GPU Time = %.3f ms\n", gpu_time);
    printf("Speedup  = %.2fx\n", cpu_time / gpu_time);

    /* ---------------- Cleanup ---------------- */
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
    free(A);
    free(B);
    free(C_cpu);
    free(C_gpu);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    return 0;
}


Writing MatMat.cu
