In [None]:
%%writefile device_info.cu
#include <stdio.h>
#include <cuda_runtime.h>
// Enumerates every CUDA device visible to the runtime and prints its name,
// compute capability, global memory size, shared memory per block and
// multiprocessor count.
// Returns 0 on success (including when no device is present) and 1 when a
// CUDA runtime query fails.
int main(){
    // Initialised so the value is well defined even if the query below fails.
    int deviceCount = 0;
    cudaError_t err = cudaGetDeviceCount(&deviceCount);
    if (err == cudaErrorNoDevice){
        // Treat "no device" as an empty list rather than as a hard error.
        deviceCount = 0;
    } else if (err != cudaSuccess){
        fprintf(stderr, "cudaGetDeviceCount failed: %s\n", cudaGetErrorString(err));
        return 1;
    }
    if (deviceCount == 0){
        printf("No CUDA device found.\n");
        return 0;
    }
    for(int dev=0; dev<deviceCount; ++dev){
        cudaDeviceProp p;
        err = cudaGetDeviceProperties(&p, dev);
        if (err != cudaSuccess){
            fprintf(stderr, "cudaGetDeviceProperties(%d) failed: %s\n",
                    dev, cudaGetErrorString(err));
            return 1;
        }
        printf("Device %d: %s\n", dev, p.name);
        printf("Compute capability: %d.%d\n", p.major, p.minor);
        // totalGlobalMem is in bytes; 1048576 = 1024 * 1024 converts to MB.
        printf("Global memory: %zu MB\n", p.totalGlobalMem/1048576);
        printf("Shared memory per block: %zu bytes\n", p.sharedMemPerBlock);
        printf("Multiprocessors: %d\n", p.multiProcessorCount);
    }
    return 0;
}


Writing device_info.cu


In [None]:
!nvcc device_info.cu -o device_info
!./device_info


Device 0: Tesla T4
Compute capability: 7.5
Global memory: 15095 MB
Shared memory per block: 49152 bytes
Multiprocessors: 40


In [None]:

!nvidia-smi


Wed Nov  5 15:39:54 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   54C    P8             13W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [None]:
!nvcc --version


nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2024 NVIDIA Corporation
Built on Thu_Jun__6_02:18:23_PDT_2024
Cuda compilation tools, release 12.5, V12.5.82
Build cuda_12.5.r12.5/compiler.34385749_0


In [None]:
%%writefile ps2_hello_single_block.cu
#include <stdio.h>

// Kernel: every thread prints one greeting line tagged with its x-index
// inside the block. Takes no arguments and writes no memory.
__global__ void helloFromThreads() {
    printf("Hello World from thread %d in single block!\n", (int)threadIdx.x);
}

// Launches helloFromThreads on a single block and waits for the device to
// finish. Returns 0 on success and 1 if the launch or the kernel run fails.
int main() {
    int threadsPerBlock = 8;  // You can change to 16, 32, etc.
    helloFromThreads<<<1, threadsPerBlock>>>(); // 1 block, multiple threads

    // A launch statement yields no status itself; launch failures are
    // reported through cudaGetLastError().
    cudaError_t err = cudaGetLastError();
    if (err == cudaSuccess)
        err = cudaDeviceSynchronize();  // also reports errors raised while the kernel ran
    if (err != cudaSuccess) {
        fprintf(stderr, "helloFromThreads failed: %s\n", cudaGetErrorString(err));
        return 1;
    }
    return 0;
}


Overwriting ps2_hello_single_block.cu


In [None]:
# Generate native code for the GPU in this runtime. The toolkit here (12.5) is
# newer than the CUDA version the driver reports (12.4), so relying on the
# driver to JIT-compile PTX may fail and the kernel would then never run.
!nvcc -arch=native ps2_hello_single_block.cu -o ps2_hello_single_block
!./ps2_hello_single_block


In [None]:
%%writefile ps3_hello_multi_block.cu
#include <stdio.h>

// Kernel: every thread prints one greeting line tagged with its block's
// x-index and its own x-index inside that block.
__global__ void helloFromThreads() {
    printf("Hello World from block %d, thread %d!\n",
           (int)blockIdx.x, (int)threadIdx.x);
}

// Launches helloFromThreads on several blocks and waits for the device to
// finish. Returns 0 on success and 1 if the launch or the kernel run fails.
int main() {
    int blocks = 3;           // 3 blocks
    int threadsPerBlock = 4;  // 4 threads in each block
    helloFromThreads<<<blocks, threadsPerBlock>>>();

    // A launch statement yields no status itself; launch failures are
    // reported through cudaGetLastError().
    cudaError_t err = cudaGetLastError();
    if (err == cudaSuccess)
        err = cudaDeviceSynchronize();  // also reports errors raised while the kernel ran
    if (err != cudaSuccess) {
        fprintf(stderr, "helloFromThreads failed: %s\n", cudaGetErrorString(err));
        return 1;
    }
    return 0;
}


Writing ps3_hello_multi_block.cu


In [None]:
# Generate native code for the GPU in this runtime. The toolkit here (12.5) is
# newer than the CUDA version the driver reports (12.4), so relying on the
# driver to JIT-compile PTX may fail and the kernel would then never run.
!nvcc -arch=native ps3_hello_multi_block.cu -o ps3_hello_multi_block
!./ps3_hello_multi_block


In [None]:
%%writefile ps4_hello_2d_blocks_threads.cu
#include <stdio.h>

// Kernel: every thread prints one line with the (x, y) index of its block
// and its own (x, y) index inside that block.
__global__ void helloFrom2D() {
    const int bx = blockIdx.x,  by = blockIdx.y;
    const int tx = threadIdx.x, ty = threadIdx.y;
    printf("Hello from block(%d,%d), thread(%d,%d)\n", bx, by, tx, ty);
}

// Launches helloFrom2D on a 2D grid of 2D blocks and waits for the device to
// finish. Returns 0 on success and 1 if the launch or the kernel run fails.
int main() {
    dim3 blocks(2, 2);   // 2x2 = 4 blocks
    dim3 threads(3, 2);  // 3x2 = 6 threads per block
    helloFrom2D<<<blocks, threads>>>();

    // A launch statement yields no status itself; launch failures are
    // reported through cudaGetLastError().
    cudaError_t err = cudaGetLastError();
    if (err == cudaSuccess)
        err = cudaDeviceSynchronize();  // also reports errors raised while the kernel ran
    if (err != cudaSuccess) {
        fprintf(stderr, "helloFrom2D failed: %s\n", cudaGetErrorString(err));
        return 1;
    }
    return 0;
}


Writing ps4_hello_2d_blocks_threads.cu


In [None]:
# Generate native code for the GPU in this runtime. The toolkit here (12.5) is
# newer than the CUDA version the driver reports (12.4), so relying on the
# driver to JIT-compile PTX may fail and the kernel would then never run.
!nvcc -arch=native ps4_hello_2d_blocks_threads.cu -o ps4_hello_2d_blocks_threads
!./ps4_hello_2d_blocks_threads


In [None]:
%%writefile ps5_vector_add.cu
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <sys/time.h>

// Timing helper: returns the current time reported by gettimeofday() as a
// single double, in seconds (whole seconds plus the microsecond fraction).
double get_time(){
    struct timeval now;
    gettimeofday(&now, NULL);
    const double fraction = now.tv_usec * 1e-6;  // microseconds -> seconds
    return now.tv_sec + fraction;
}

// CUDA kernel: element-wise sum, C[i] = A[i] + B[i], for i in [0, N).
// Each thread handles the one element at its global 1D index; threads whose
// index is N or larger leave memory untouched.
__global__ void vectorAdd(float *A, float *B, float *C, int N) {
    const int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx >= N)
        return;
    C[idx] = A[idx] + B[idx];
}

// Reports a failed CUDA runtime call on stderr and terminates the program.
static void checkCuda(cudaError_t status, const char *what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Adds two random vectors on the CPU and on the GPU, times both, checks the
// GPU result against the CPU result, and prints the timings and speedup.
// Returns 0 when the results agree, 1 on allocation failure or mismatch;
// a failing CUDA call terminates the program through checkCuda().
int main() {
    const int N = 1000000;  // change to 100000, 1000000, 10000000
    const size_t size = (size_t)N * sizeof(float);

    float *h_A = (float*)malloc(size);
    float *h_B = (float*)malloc(size);
    float *h_C = (float*)malloc(size);      // CPU reference result
    float *h_gpu = (float*)malloc(size);    // GPU result copied back to the host
    if (!h_A || !h_B || !h_C || !h_gpu) {
        fprintf(stderr, "Host allocation of %zu bytes failed\n", size);
        return 1;
    }
    float *d_A = NULL, *d_B = NULL, *d_C = NULL;

    for (int i = 0; i < N; i++) {
        h_A[i] = rand() / (float)RAND_MAX;
        h_B[i] = rand() / (float)RAND_MAX;
    }

    checkCuda(cudaMalloc((void**)&d_A, size), "cudaMalloc(d_A)");
    checkCuda(cudaMalloc((void**)&d_B, size), "cudaMalloc(d_B)");
    checkCuda(cudaMalloc((void**)&d_C, size), "cudaMalloc(d_C)");

    // CPU computation
    double startCPU = get_time();
    for (int i = 0; i < N; i++)
        h_C[i] = h_A[i] + h_B[i];
    double endCPU = get_time();
    double cpuTime = endCPU - startCPU;

    // GPU computation
    checkCuda(cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice), "copy A to device");
    checkCuda(cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice), "copy B to device");
    int threadsPerBlock = 256;
    // Ceiling division so a partially filled last block covers the tail.
    int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;

    // Only the kernel (launch + completion) is timed; transfers are excluded.
    double startGPU = get_time();
    vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, N);
    checkCuda(cudaGetLastError(), "vectorAdd launch");
    checkCuda(cudaDeviceSynchronize(), "vectorAdd execution");
    double endGPU = get_time();
    double gpuTime = endGPU - startGPU;

    checkCuda(cudaMemcpy(h_gpu, d_C, size, cudaMemcpyDeviceToHost), "copy C to host");

    // Compare against the CPU reference; the negated test also counts NaN.
    int mismatches = 0;
    for (int i = 0; i < N; i++) {
        float diff = h_gpu[i] - h_C[i];
        if (diff < 0) diff = -diff;
        if (!(diff <= 1e-5f)) ++mismatches;
    }

    printf("Vector Size = %d\n", N);
    printf("CPU Time = %f sec\n", cpuTime);
    printf("GPU Time = %f sec\n", gpuTime);
    printf("Speedup = %f\n", cpuTime / gpuTime);
    if (mismatches != 0)
        fprintf(stderr, "Verification FAILED: %d of %d elements differ from the CPU result\n",
                mismatches, N);

    cudaFree(d_A); cudaFree(d_B); cudaFree(d_C);
    free(h_A); free(h_B); free(h_C); free(h_gpu);
    return mismatches != 0 ? 1 : 0;
}


Writing ps5_vector_add.cu


In [None]:
# Generate native code for the GPU in this runtime. The toolkit here (12.5) is
# newer than the CUDA version the driver reports (12.4), so relying on the
# driver to JIT-compile PTX may fail and the kernel would then never run.
!nvcc -arch=native ps5_vector_add.cu -o ps5_vector_add
!./ps5_vector_add | cat


Vector Size = 1000000
CPU Time = 0.004659 sec
GPU Time = 0.007375 sec
Speedup = 0.631720


In [None]:
%%writefile ps6_matrix_add.cu
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <sys/time.h>

// Timing helper: returns the current time reported by gettimeofday() as a
// single double, in seconds (whole seconds plus the microsecond fraction).
double get_time(){
    struct timeval now;
    gettimeofday(&now, NULL);
    const double fraction = now.tv_usec * 1e-6;  // microseconds -> seconds
    return now.tv_sec + fraction;
}

// Kernel: element-wise sum of two row-major M x N matrices, C = A + B.
// The thread's y-index selects the row and its x-index selects the column;
// threads that fall outside the matrix do nothing.
__global__ void matrixAdd(float *A, float *B, float *C, int M, int N) {
    const int c = threadIdx.x + blockIdx.x * blockDim.x;
    const int r = threadIdx.y + blockIdx.y * blockDim.y;
    if (r >= M || c >= N)
        return;
    const int idx = r * N + c;  // flat row-major offset
    C[idx] = A[idx] + B[idx];
}

// Reports a failed CUDA runtime call on stderr and terminates the program.
static void checkCuda(cudaError_t status, const char *what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Adds two random M x N matrices on the CPU and on the GPU, times both,
// checks the GPU result against the CPU result, and prints the timings.
// Returns 0 when the results agree, 1 on allocation failure or mismatch;
// a failing CUDA call terminates the program through checkCuda().
int main() {
    const int M = 500, N = 500;
    const int count = M * N;
    const size_t size = (size_t)count * sizeof(float);
    float *h_A = (float*)malloc(size);
    float *h_B = (float*)malloc(size);
    float *h_C = (float*)malloc(size);      // CPU reference result
    float *h_gpu = (float*)malloc(size);    // GPU result copied back to the host
    if (!h_A || !h_B || !h_C || !h_gpu) {
        fprintf(stderr, "Host allocation of %zu bytes failed\n", size);
        return 1;
    }
    float *d_A = NULL, *d_B = NULL, *d_C = NULL;

    for (int i = 0; i < count; i++) {
        h_A[i] = rand() / (float)RAND_MAX;
        h_B[i] = rand() / (float)RAND_MAX;
    }

    checkCuda(cudaMalloc((void**)&d_A, size), "cudaMalloc(d_A)");
    checkCuda(cudaMalloc((void**)&d_B, size), "cudaMalloc(d_B)");
    checkCuda(cudaMalloc((void**)&d_C, size), "cudaMalloc(d_C)");

    double startCPU = get_time();
    for (int i = 0; i < count; i++)
        h_C[i] = h_A[i] + h_B[i];
    double endCPU = get_time();
    double cpuTime = endCPU - startCPU;

    checkCuda(cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice), "copy A to device");
    checkCuda(cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice), "copy B to device");

    dim3 threadsPerBlock(16, 16);
    // Ceiling division in both dimensions: x spans columns (N), y spans rows (M).
    dim3 blocksPerGrid((N + 15) / 16, (M + 15) / 16);

    // Only the kernel (launch + completion) is timed; transfers are excluded.
    double startGPU = get_time();
    matrixAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, M, N);
    checkCuda(cudaGetLastError(), "matrixAdd launch");
    checkCuda(cudaDeviceSynchronize(), "matrixAdd execution");
    double endGPU = get_time();
    double gpuTime = endGPU - startGPU;

    checkCuda(cudaMemcpy(h_gpu, d_C, size, cudaMemcpyDeviceToHost), "copy C to host");

    // Compare against the CPU reference; the negated test also counts NaN.
    int mismatches = 0;
    for (int i = 0; i < count; i++) {
        float diff = h_gpu[i] - h_C[i];
        if (diff < 0) diff = -diff;
        if (!(diff <= 1e-5f)) ++mismatches;
    }

    printf("Matrix Size = %dx%d\n", M, N);
    printf("CPU Time = %f sec\n", cpuTime);
    printf("GPU Time = %f sec\n", gpuTime);
    printf("Speedup = %f\n", cpuTime / gpuTime);
    if (mismatches != 0)
        fprintf(stderr, "Verification FAILED: %d of %d elements differ from the CPU result\n",
                mismatches, count);

    cudaFree(d_A); cudaFree(d_B); cudaFree(d_C);
    free(h_A); free(h_B); free(h_C); free(h_gpu);
    return mismatches != 0 ? 1 : 0;
}


Writing ps6_matrix_add.cu


In [None]:
# Generate native code for the GPU in this runtime. The toolkit here (12.5) is
# newer than the CUDA version the driver reports (12.4), so relying on the
# driver to JIT-compile PTX may fail and the kernel would then never run.
!nvcc -arch=native ps6_matrix_add.cu -o ps6_matrix_add
!./ps6_matrix_add | cat


Matrix Size = 500x500
CPU Time = 0.001278 sec
GPU Time = 0.008201 sec
Speedup = 0.155852


In [None]:
%%writefile ps7_dot_product.cu
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <sys/time.h>

// Timing helper: returns the current time reported by gettimeofday() as a
// single double, in seconds (whole seconds plus the microsecond fraction).
double get_time(){
    struct timeval now;
    gettimeofday(&now, NULL);
    const double fraction = now.tv_usec * 1e-6;  // microseconds -> seconds
    return now.tv_sec + fraction;
}

// Kernel: adds the dot product of A and B (N elements each) into *C.
//
// Launch layout: 1D grid of 1D blocks. blockDim.x must be between 1 and 256,
// the size of the static shared buffer; it does NOT have to be a power of two.
// Precondition: *C must be zeroed by the caller before the launch, because
// every block adds its partial sum to it with atomicAdd.
__global__ void dotProduct(float *A, float *B, float *C, int N) {
    __shared__ float cache[256];
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    int cacheIndex = threadIdx.x;

    // Grid-stride loop: each thread accumulates every element whose index is
    // congruent to its global id modulo the total number of launched threads.
    float temp = 0.0f;
    while (tid < N) {
        temp += A[tid] * B[tid];
        tid += blockDim.x * gridDim.x;
    }
    cache[cacheIndex] = temp;
    __syncthreads();

    // Tree reduction in shared memory. Rounding the half up keeps the middle
    // element when the active count is odd, so no partial sum is dropped for
    // block sizes that are not powers of two. For power-of-two sizes this
    // performs exactly the same additions as plain halving.
    for (int active = blockDim.x; active > 1; ) {
        int half = (active + 1) / 2;
        if (cacheIndex + half < active)
            cache[cacheIndex] += cache[cacheIndex + half];
        // Outside the branch: every thread of the block must reach the barrier.
        __syncthreads();
        active = half;
    }

    // One atomic per block combines the per-block partial sums.
    if (cacheIndex == 0)
        atomicAdd(C, cache[0]);
}

// Reports a failed CUDA runtime call on stderr and terminates the program.
static void checkCuda(cudaError_t status, const char *what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Computes the dot product of two random vectors on the CPU and on the GPU,
// times both, checks the GPU value against the CPU value, and prints the
// GPU result, the timings and the speedup.
// Returns 0 when the results agree, 1 on allocation failure or mismatch;
// a failing CUDA call terminates the program through checkCuda().
int main() {
    const int N = 1000000;
    const size_t size = (size_t)N * sizeof(float);
    float *h_A = (float*)malloc(size);
    float *h_B = (float*)malloc(size);
    if (!h_A || !h_B) {
        fprintf(stderr, "Host allocation of %zu bytes failed\n", size);
        return 1;
    }
    float h_C = 0, *d_A = NULL, *d_B = NULL, *d_C = NULL;

    for (int i = 0; i < N; i++) {
        h_A[i] = rand() / (float)RAND_MAX;
        h_B[i] = rand() / (float)RAND_MAX;
    }

    checkCuda(cudaMalloc((void**)&d_A, size), "cudaMalloc(d_A)");
    checkCuda(cudaMalloc((void**)&d_B, size), "cudaMalloc(d_B)");
    checkCuda(cudaMalloc((void**)&d_C, sizeof(float)), "cudaMalloc(d_C)");

    checkCuda(cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice), "copy A to device");
    checkCuda(cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice), "copy B to device");
    // The kernel accumulates into *d_C with atomicAdd, so it must start at zero.
    checkCuda(cudaMemset(d_C, 0, sizeof(float)), "cudaMemset(d_C)");

    double startCPU = get_time();
    float cpu_sum = 0;
    for (int i = 0; i < N; i++)
        cpu_sum += h_A[i] * h_B[i];
    double endCPU = get_time();
    double cpuTime = endCPU - startCPU;

    // 256 threads per block matches the kernel's 256-entry shared buffer.
    int threads = 256, blocks = 256;

    // Only the kernel (launch + completion) is timed; transfers are excluded.
    double startGPU = get_time();
    dotProduct<<<blocks, threads>>>(d_A, d_B, d_C, N);
    checkCuda(cudaGetLastError(), "dotProduct launch");
    checkCuda(cudaDeviceSynchronize(), "dotProduct execution");
    double endGPU = get_time();
    double gpuTime = endGPU - startGPU;

    checkCuda(cudaMemcpy(&h_C, d_C, sizeof(float), cudaMemcpyDeviceToHost), "copy C to host");

    // CPU and GPU add the products in different orders, so compare with a
    // relative tolerance. The negated test also treats NaN as a mismatch.
    float diff = h_C - cpu_sum;
    if (diff < 0) diff = -diff;
    float tolerance = 1e-3f * (cpu_sum < 0 ? -cpu_sum : cpu_sum);
    int mismatch = !(diff <= tolerance);

    printf("Dot Product = %f\n", h_C);
    printf("CPU Time = %f sec\n", cpuTime);
    printf("GPU Time = %f sec\n", gpuTime);
    printf("Speedup = %f\n", cpuTime / gpuTime);
    if (mismatch)
        fprintf(stderr, "Verification FAILED: GPU result %f differs from CPU result %f\n",
                h_C, cpu_sum);

    cudaFree(d_A); cudaFree(d_B); cudaFree(d_C);
    free(h_A); free(h_B);
    return mismatch ? 1 : 0;
}


Overwriting ps7_dot_product.cu


In [None]:
# Generate native code for the GPU in this runtime. The toolkit here (12.5) is
# newer than the CUDA version the driver reports (12.4), so relying on the
# driver to JIT-compile PTX may fail and the kernel would then never run.
!nvcc -arch=native ps7_dot_product.cu -o ps7_dot_product
!./ps7_dot_product | cat


Dot Product = 0.000000
CPU Time = 0.003720 sec
GPU Time = 0.009787 sec
Speedup = 0.380073


In [None]:
%%writefile ps8_matrix_mul.cu
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <sys/time.h>

// Timing helper: returns the current time reported by gettimeofday() as a
// single double, in seconds (whole seconds plus the microsecond fraction).
double get_time(){
    struct timeval now;
    gettimeofday(&now, NULL);
    const double fraction = now.tv_usec * 1e-6;  // microseconds -> seconds
    return now.tv_sec + fraction;
}

// Kernel: row-major matrix product C = A * B, where A is M x N, B is N x P
// and C is M x P. Each thread computes one element of C: its y-index selects
// the row and its x-index selects the column. Threads outside C do nothing.
__global__ void matMul(float *A, float *B, float *C, int M, int N, int P) {
    const int c = threadIdx.x + blockIdx.x * blockDim.x;
    const int r = threadIdx.y + blockIdx.y * blockDim.y;
    if (r >= M || c >= P)
        return;

    // Inner product of row r of A with column c of B.
    float acc = 0;
    for (int k = 0; k < N; ++k) {
        acc += A[r * N + k] * B[k * P + c];
    }
    C[r * P + c] = acc;
}

// Reports a failed CUDA runtime call on stderr and terminates the program.
static void checkCuda(cudaError_t status, const char *what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Multiplies a random M x N matrix by a random N x P matrix on the CPU and
// on the GPU, times both, checks the GPU result against the CPU result, and
// prints the timings and speedup.
// Returns 0 when the results agree, 1 on allocation failure or mismatch;
// a failing CUDA call terminates the program through checkCuda().
int main() {
    const int M = 200, N = 200, P = 200;
    const size_t sizeA = (size_t)M * N * sizeof(float);
    const size_t sizeB = (size_t)N * P * sizeof(float);
    const size_t sizeC = (size_t)M * P * sizeof(float);

    float *h_A = (float*)malloc(sizeA);
    float *h_B = (float*)malloc(sizeB);
    float *h_C = (float*)malloc(sizeC);      // CPU reference result
    float *h_gpu = (float*)malloc(sizeC);    // GPU result copied back to the host
    if (!h_A || !h_B || !h_C || !h_gpu) {
        fprintf(stderr, "Host allocation failed\n");
        return 1;
    }
    float *d_A = NULL, *d_B = NULL, *d_C = NULL;

    for (int i = 0; i < M * N; i++) h_A[i] = rand() / (float)RAND_MAX;
    for (int i = 0; i < N * P; i++) h_B[i] = rand() / (float)RAND_MAX;

    checkCuda(cudaMalloc((void**)&d_A, sizeA), "cudaMalloc(d_A)");
    checkCuda(cudaMalloc((void**)&d_B, sizeB), "cudaMalloc(d_B)");
    checkCuda(cudaMalloc((void**)&d_C, sizeC), "cudaMalloc(d_C)");

    double startCPU = get_time();
    for (int i = 0; i < M; i++)
        for (int j = 0; j < P; j++) {
            float sum = 0;
            for (int k = 0; k < N; k++)
                sum += h_A[i * N + k] * h_B[k * P + j];
            h_C[i * P + j] = sum;
        }
    double endCPU = get_time();
    double cpuTime = endCPU - startCPU;

    checkCuda(cudaMemcpy(d_A, h_A, sizeA, cudaMemcpyHostToDevice), "copy A to device");
    checkCuda(cudaMemcpy(d_B, h_B, sizeB, cudaMemcpyHostToDevice), "copy B to device");

    dim3 threadsPerBlock(16, 16);
    // Ceiling division in both dimensions: x spans columns (P), y spans rows (M).
    dim3 blocksPerGrid((P + 15) / 16, (M + 15) / 16);

    // Only the kernel (launch + completion) is timed; transfers are excluded.
    double startGPU = get_time();
    matMul<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, M, N, P);
    checkCuda(cudaGetLastError(), "matMul launch");
    checkCuda(cudaDeviceSynchronize(), "matMul execution");
    double endGPU = get_time();
    double gpuTime = endGPU - startGPU;

    checkCuda(cudaMemcpy(h_gpu, d_C, sizeC, cudaMemcpyDeviceToHost), "copy C to host");

    // Host and device may round the accumulated sums differently, so compare
    // with a mixed absolute/relative tolerance. The negated test also counts NaN.
    int mismatches = 0;
    for (int i = 0; i < M * P; i++) {
        float ref = h_C[i];
        float diff = h_gpu[i] - ref;
        if (diff < 0) diff = -diff;
        float tolerance = 1e-5f + 1e-4f * (ref < 0 ? -ref : ref);
        if (!(diff <= tolerance)) ++mismatches;
    }

    printf("Matrix Size = %dx%dx%d\n", M, N, P);
    printf("CPU Time = %f sec\n", cpuTime);
    printf("GPU Time = %f sec\n", gpuTime);
    printf("Speedup = %f\n", cpuTime / gpuTime);
    if (mismatches != 0)
        fprintf(stderr, "Verification FAILED: %d of %d elements differ from the CPU result\n",
                mismatches, M * P);

    cudaFree(d_A); cudaFree(d_B); cudaFree(d_C);
    free(h_A); free(h_B); free(h_C); free(h_gpu);
    return mismatches != 0 ? 1 : 0;
}


Writing ps8_matrix_mul.cu


In [None]:
# Generate native code for the GPU in this runtime. The toolkit here (12.5) is
# newer than the CUDA version the driver reports (12.4), so relying on the
# driver to JIT-compile PTX may fail and the kernel would then never run.
!nvcc -arch=native ps8_matrix_mul.cu -o ps8_matrix_mul
!./ps8_matrix_mul | cat


Matrix Size = 200x200x200
CPU Time = 0.025090 sec
GPU Time = 0.007593 sec
Speedup = 3.304393
