In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [1]:
!nvidia-smi


Sun Dec 21 20:13:55 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 570.172.08             Driver Version: 570.172.08     CUDA Version: 12.8     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla P100-PCIE-16GB           Off |   00000000:00:04.0 Off |                    0 |
| N/A   39C    P0             25W /  250W |       0MiB /  16384MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [2]:
%%writefile vector_add.cu
#include <iostream>
#include <cuda_runtime.h>
#include <cmath>

__global__ void vectorAdd(const float* a, const float* b, float* c, int n) {
    int idx = blockIdx.x;   // one thread per block

    if (idx < n) {
        c[idx] = a[idx] + b[idx];
    }
}

int main() {
    int n = 16;
    size_t size = n * sizeof(float);

    float *h_a = new float[n];
    float *h_b = new float[n];
    float *h_c = new float[n];

    for (int i = 0; i < n; i++) {
        h_a[i] = i * 1.0f;
        h_b[i] = 2.0f * i;
        h_c[i] = 0.0f;
    }

    float *d_a, *d_b, *d_c;
    cudaMalloc(&d_a, size);
    cudaMalloc(&d_b, size);
    cudaMalloc(&d_c, size);

    cudaMemcpy(d_a, h_a, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, h_b, size, cudaMemcpyHostToDevice);

    // n blocks, 1 thread per block
    vectorAdd<<<n, 1>>>(d_a, d_b, d_c, n);
    cudaDeviceSynchronize();

    cudaMemcpy(h_c, d_c, size, cudaMemcpyDeviceToHost);

    std::cout << "Results:\n";
    for (int i = 0; i < n; i++) {
        std::cout << "i=" << i
                  << " a=" << h_a[i]
                  << " b=" << h_b[i]
                  << " c=" << h_c[i]
                  << std::endl;
    }

    bool correct = true;
    for (int i = 0; i < n; i++) {
        float expected = h_a[i] + h_b[i];
        if (fabs(h_c[i] - expected) > 1e-5) {
            correct = false;
            break;
        }
    }

    std::cout << "\nVector Add: "
              << (correct ? "PASS" : "FAIL") << std::endl;

    delete[] h_a;
    delete[] h_b;
    delete[] h_c;
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);

    return 0;
}


Writing vector_add.cu


In [3]:
!ls

vector_add.cu


In [4]:
!sed -n '1,20p' vector_add.cu


#include <iostream>
#include <cuda_runtime.h>
#include <cmath>

__global__ void vectorAdd(const float* a, const float* b, float* c, int n) {
    int idx = blockIdx.x;   // one thread per block

    if (idx < n) {
        c[idx] = a[idx] + b[idx];
    }
}

int main() {
    int n = 16;
    size_t size = n * sizeof(float);

    float *h_a = new float[n];
    float *h_b = new float[n];
    float *h_c = new float[n];



In [5]:
!nvcc vector_add.cu -o vector_add
!./vector_add


Results:
i=0 a=0 b=0 c=0
i=1 a=1 b=2 c=3
i=2 a=2 b=4 c=6
i=3 a=3 b=6 c=9
i=4 a=4 b=8 c=12
i=5 a=5 b=10 c=15
i=6 a=6 b=12 c=18
i=7 a=7 b=14 c=21
i=8 a=8 b=16 c=24
i=9 a=9 b=18 c=27
i=10 a=10 b=20 c=30
i=11 a=11 b=22 c=33
i=12 a=12 b=24 c=36
i=13 a=13 b=26 c=39
i=14 a=14 b=28 c=42
i=15 a=15 b=30 c=45

Vector Add: PASS


In [6]:
%%writefile vector_add.cu
#include <iostream>
#include <cuda_runtime.h>
#include <cmath>

__global__ void vectorAdd(const float* a, const float* b, float* c, int n) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n) {
        c[idx] = a[idx] + b[idx];
    }
}

int main() {
    int n = 1000000;
    size_t size = n * sizeof(float);

    float *h_a = new float[n];
    float *h_b = new float[n];
    float *h_c = new float[n];

    for (int i = 0; i < n; i++) {
        h_a[i] = i * 1.0f;
        h_b[i] = 2.0f * i;
    }

    float *d_a, *d_b, *d_c;
    cudaMalloc(&d_a, size);
    cudaMalloc(&d_b, size);
    cudaMalloc(&d_c, size);

    cudaMemcpy(d_a, h_a, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, h_b, size, cudaMemcpyHostToDevice);

    int blockSize = 256;
    int gridSize = (n + blockSize - 1) / blockSize;

    vectorAdd<<<gridSize, blockSize>>>(d_a, d_b, d_c, n);
    cudaDeviceSynchronize();

    cudaMemcpy(h_c, d_c, size, cudaMemcpyDeviceToHost);

    bool correct = true;
    for (int i = 0; i < n; i++) {
        float expected = h_a[i] + h_b[i];
        if (fabs(h_c[i] - expected) > 1e-5) {
            correct = false;
            break;
        }
    }

    std::cout << "Vector Add: " << (correct ? "PASS" : "FAIL") << std::endl;

    delete[] h_a;
    delete[] h_b;
    delete[] h_c;
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);

    return 0;
}


Overwriting vector_add.cu


In [7]:
%%writefile multiply_scale.cu
#include <iostream>
#include <cuda_runtime.h>
#include <cmath>

__global__ void multiplyScale(const float* a, const float* b, float* c,
                              float alpha, int n) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n) {
        c[idx] = alpha * a[idx] * b[idx];
    }
}

int main() {
    int n = 1000000;
    float alpha = 0.5f;
    size_t size = n * sizeof(float);

    float *h_a = new float[n];
    float *h_b = new float[n];
    float *h_c = new float[n];

    for (int i = 0; i < n; i++) {
        h_a[i] = i * 1.0f;
        h_b[i] = 3.0f;
    }

    float *d_a, *d_b, *d_c;
    cudaMalloc(&d_a, size);
    cudaMalloc(&d_b, size);
    cudaMalloc(&d_c, size);

    cudaMemcpy(d_a, h_a, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, h_b, size, cudaMemcpyHostToDevice);

    int blockSize = 256;
    int gridSize = (n + blockSize - 1) / blockSize;

    multiplyScale<<<gridSize, blockSize>>>(d_a, d_b, d_c, alpha, n);
    cudaDeviceSynchronize();

    cudaMemcpy(h_c, d_c, size, cudaMemcpyDeviceToHost);

    bool correct = true;
    for (int i = 0; i < n; i++) {
        float expected = alpha * h_a[i] * h_b[i];
        if (fabs(h_c[i] - expected) > 1e-5) {
            correct = false;
            break;
        }
    }

    std::cout << "Multiply Scale: " << (correct ? "PASS" : "FAIL") << std::endl;

    delete[] h_a;
    delete[] h_b;
    delete[] h_c;
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);

    return 0;
}


Writing multiply_scale.cu


In [8]:
%%writefile relu.cu
#include <iostream>
#include <cuda_runtime.h>
#include <cmath>

__global__ void relu(const float* x, float* y, int n) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n) {
        y[idx] = (x[idx] > 0.0f) ? x[idx] : 0.0f;
    }
}

int main() {
    int n = 1000000;
    size_t size = n * sizeof(float);

    float *h_x = new float[n];
    float *h_y = new float[n];

    for (int i = 0; i < n; i++) {
        h_x[i] = (i % 2 == 0) ? i * 1.0f : -i * 1.0f;
    }

    float *d_x, *d_y;
    cudaMalloc(&d_x, size);
    cudaMalloc(&d_y, size);

    cudaMemcpy(d_x, h_x, size, cudaMemcpyHostToDevice);

    int blockSize = 256;
    int gridSize = (n + blockSize - 1) / blockSize;

    relu<<<gridSize, blockSize>>>(d_x, d_y, n);
    cudaDeviceSynchronize();

    cudaMemcpy(h_y, d_y, size, cudaMemcpyDeviceToHost);

    bool correct = true;
    for (int i = 0; i < n; i++) {
        float expected = (h_x[i] > 0.0f) ? h_x[i] : 0.0f;
        if (fabs(h_y[i] - expected) > 1e-5) {
            correct = false;
            break;
        }
    }

    std::cout << "ReLU: " << (correct ? "PASS" : "FAIL") << std::endl;

    delete[] h_x;
    delete[] h_y;
    cudaFree(d_x);
    cudaFree(d_y);

    return 0;
}


Writing relu.cu


In [9]:
!nvcc vector_add.cu -o vector_add
!./vector_add

!nvcc multiply_scale.cu -o multiply_scale
!./multiply_scale

!nvcc relu.cu -o relu
!./relu


Vector Add: PASS
Multiply Scale: PASS
ReLU: PASS


In [10]:
%%writefile vector_add_2.cu
#include <iostream>
#include <cuda_runtime.h>
#include <cmath>

__global__ void vectorAdd(const float* a, const float* b, float* c, int n) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n) {
        c[idx] = a[idx] + b[idx];
    }
}

int main() {
    int Ns[] = {1000, 100000, 10000000};
    int blockSizes[] = {32, 128, 256, 512};

    for (int ni = 0; ni < 3; ni++) {
        int n = Ns[ni];
        size_t size = n * sizeof(float);

        float *h_a = new float[n];
        float *h_b = new float[n];
        float *h_c = new float[n];

        for (int i = 0; i < n; i++) {
            h_a[i] = i * 1.0f;
            h_b[i] = 2.0f * i;
        }

        float *d_a, *d_b, *d_c;
        cudaMalloc(&d_a, size);
        cudaMalloc(&d_b, size);
        cudaMalloc(&d_c, size);

        cudaMemcpy(d_a, h_a, size, cudaMemcpyHostToDevice);
        cudaMemcpy(d_b, h_b, size, cudaMemcpyHostToDevice);

        std::cout << "\nInput size n = " << n << std::endl;

        for (int bi = 0; bi < 4; bi++) {
            int blockSize = blockSizes[bi];
            int gridSize = (n + blockSize - 1) / blockSize;

            vectorAdd<<<gridSize, blockSize>>>(d_a, d_b, d_c, n);
            cudaDeviceSynchronize();

            cudaMemcpy(h_c, d_c, size, cudaMemcpyDeviceToHost);

            bool correct = true;
            for (int i = 0; i < n; i++) {
                float expected = h_a[i] + h_b[i];
                if (fabs(h_c[i] - expected) > 1e-5) {
                    correct = false;
                    break;
                }
            }

            std::cout << "  blockSize = " << blockSize
                      << ", gridSize = " << gridSize
                      << ", totalThreads = " << gridSize * blockSize
                      << " -> " << (correct ? "PASS" : "FAIL")
                      << std::endl;
        }

        delete[] h_a;
        delete[] h_b;
        delete[] h_c;
        cudaFree(d_a);
        cudaFree(d_b);
        cudaFree(d_c);
    }

    return 0;
}


Writing vector_add_2.cu


In [12]:
!nvcc vector_add_2.cu -o vector_add_2
!./vector_add_2



Input size n = 1000
  blockSize = 32, gridSize = 32, totalThreads = 1024 -> PASS
  blockSize = 128, gridSize = 8, totalThreads = 1024 -> PASS
  blockSize = 256, gridSize = 4, totalThreads = 1024 -> PASS
  blockSize = 512, gridSize = 2, totalThreads = 1024 -> PASS

Input size n = 100000
  blockSize = 32, gridSize = 3125, totalThreads = 100000 -> PASS
  blockSize = 128, gridSize = 782, totalThreads = 100096 -> PASS
  blockSize = 256, gridSize = 391, totalThreads = 100096 -> PASS
  blockSize = 512, gridSize = 196, totalThreads = 100352 -> PASS

Input size n = 10000000
  blockSize = 32, gridSize = 312500, totalThreads = 10000000 -> PASS
  blockSize = 128, gridSize = 78125, totalThreads = 10000000 -> PASS
  blockSize = 256, gridSize = 39063, totalThreads = 10000128 -> PASS
  blockSize = 512, gridSize = 19532, totalThreads = 10000384 -> PASS
