In [9]:
%%writefile add.cu

#include <stdio.h>

// CUDA kernel for element-wise vector addition: c[i] = a[i] + b[i], 0 <= i < n.
//
// Parameters (all pointers must be device-accessible, each holding >= n floats):
//   a, b - input vectors (read only)
//   c    - output vector; may alias a or b, since each element is read
//          before it is written by the same thread
//   n    - number of elements; n <= 0 makes the kernel a no-op
//
// Launch layout: 1D grid of 1D blocks. The grid-stride loop below makes the
// result independent of the launch configuration, so a grid with fewer than n
// threads still processes every element. No shared memory is used.
__global__ void vectorAddition(const float *a, const float *b, float *c, int n) {
    if (n <= 0)
        return;

    // Index arithmetic is done in size_t so that blockIdx.x * blockDim.x
    // cannot wrap around for element counts close to INT_MAX.
    const size_t count = (size_t)n;
    const size_t stride = (size_t)gridDim.x * blockDim.x;

    for (size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x; i < count; i += stride)
        c[i] = a[i] + b[i];
}

// Evaluates a CUDA runtime call and aborts the program with the file, line and
// CUDA error string if it did not return cudaSuccess. Without this, a failed
// allocation, copy or launch would go unnoticed and the program would print
// whatever happens to be in the result buffer.
#ifndef CUDA_CHECK
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t cudaCheckErr_ = (call);                                \
        if (cudaCheckErr_ != cudaSuccess) {                                \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__,         \
                    __LINE__, cudaGetErrorString(cudaCheckErr_));          \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)
#endif

// Adds two vectors of n floats on the GPU and prints the result vector.
//
// Steps: allocate and fill the host vectors, copy them to the device, launch
// vectorAddition with enough 256-thread blocks to cover n, copy the result
// back and print it. Returns 0 on success; on a host allocation failure it
// returns EXIT_FAILURE, and on any CUDA failure CUDA_CHECK terminates the
// process.
int main() {
    // Size of vectors
    const int n = 10000;
    const size_t bytes = (size_t)n * sizeof(float);

    // Allocate memory for vectors on host
    float *h_a = (float *)malloc(bytes);
    float *h_b = (float *)malloc(bytes);
    float *h_c = (float *)malloc(bytes);
    if (h_a == NULL || h_b == NULL || h_c == NULL) {
        fprintf(stderr, "Host allocation of %zu bytes failed\n", bytes);
        free(h_a);  // free(NULL) is a no-op, so partial success is handled
        free(h_b);
        free(h_c);
        return EXIT_FAILURE;
    }

    // Initialize vectors on host: a[i] = i, b[i] = 2 * i
    for (int i = 0; i < n; ++i) {
        h_a[i] = (float)i;
        h_b[i] = (float)(i * 2);
    }

    // Allocate memory for vectors on device
    float *d_a = NULL, *d_b = NULL, *d_c = NULL;
    CUDA_CHECK(cudaMalloc((void **)&d_a, bytes));
    CUDA_CHECK(cudaMalloc((void **)&d_b, bytes));
    CUDA_CHECK(cudaMalloc((void **)&d_c, bytes));

    // Copy vectors from host to device
    CUDA_CHECK(cudaMemcpy(d_a, h_a, bytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_b, h_b, bytes, cudaMemcpyHostToDevice));

    // Define grid and block dimensions (ceil-div so the tail is covered)
    const int blockSize = 256;
    const int numBlocks = (n + blockSize - 1) / blockSize;

    // Perform vector addition on device
    vectorAddition<<<numBlocks, blockSize>>>(d_a, d_b, d_c, n);
    CUDA_CHECK(cudaGetLastError());       // invalid launch configuration
    CUDA_CHECK(cudaDeviceSynchronize());  // faults raised while the kernel ran

    // Copy result vector from device to host
    CUDA_CHECK(cudaMemcpy(h_c, d_c, bytes, cudaMemcpyDeviceToHost));

    // Output result vector
    printf("Vector Addition Result:\n");
    for (int i = 0; i < n; ++i) {
        printf("%f ", h_c[i]);
    }
    printf("\n");

    // Free device memory
    CUDA_CHECK(cudaFree(d_a));
    CUDA_CHECK(cudaFree(d_b));
    CUDA_CHECK(cudaFree(d_c));

    // Free host memory
    free(h_a);
    free(h_b);
    free(h_c);

    return 0;
}



Overwriting add.cu


In [10]:

!nvcc add.cu -o add
!./add

Vector Addition Result:
0.000000 3.000000 6.000000 9.000000 12.000000 15.000000 18.000000 21.000000 24.000000 27.000000 30.000000 33.000000 36.000000 39.000000 42.000000 45.000000 48.000000 51.000000 54.000000 57.000000 60.000000 63.000000 66.000000 69.000000 72.000000 75.000000 78.000000 81.000000 84.000000 87.000000 90.000000 93.000000 96.000000 99.000000 102.000000 105.000000 108.000000 111.000000 114.000000 117.000000 120.000000 123.000000 126.000000 129.000000 132.000000 135.000000 138.000000 141.000000 144.000000 147.000000 150.000000 153.000000 156.000000 159.000000 162.000000 165.000000 168.000000 171.000000 174.000000 177.000000 180.000000 183.000000 186.000000 189.000000 192.000000 195.000000 198.000000 201.000000 204.000000 207.000000 210.000000 213.000000 216.000000 219.000000 222.000000 225.000000 228.000000 231.000000 234.000000 237.000000 240.000000 243.000000 246.000000 249.000000 252.000000 255.000000 258.000000 261.000000 264.000000 267.000000 270.000000 273.000000 27

In [11]:
%%writefile matrix.cu

#include <stdio.h>

// CUDA kernel for dense matrix multiplication C = A * B on row-major data.
//
// Parameters (all pointers must be device-accessible):
//   a - m x n input matrix, element (r, i) at a[r * n + i] (read only)
//   b - n x k input matrix, element (i, col) at b[i * k + col] (read only)
//   c - m x k output matrix, element (r, col) at c[r * k + col]
//   m, n, k - matrix dimensions as described above
//
// Launch layout: 2D grid of 2D blocks in which x indexes output columns and
// y indexes output rows. Each thread computes at most one element of c, so the
// grid must cover at least k threads in x and m threads in y; surplus threads
// fall outside the bounds check and do nothing. No shared memory is used.
__global__ void matrixMultiplication(const float *a, const float *b, float *c, int m, int n, int k) {
    const int row = blockIdx.y * blockDim.y + threadIdx.y;
    const int col = blockIdx.x * blockDim.x + threadIdx.x;

    if (row < m && col < k) {
        // Flat offsets are computed in size_t: row * n and i * k overflow a
        // 32-bit int once a matrix holds more than 2^31 elements.
        const size_t aRowStart = (size_t)row * n;
        const size_t kStride = (size_t)k;

        float sum = 0.0f;  // float literal keeps the accumulator out of double
        for (int i = 0; i < n; ++i) {
            sum += a[aRowStart + i] * b[(size_t)i * kStride + col];
        }
        c[(size_t)row * kStride + col] = sum;
    }
}

// Evaluates a CUDA runtime call and aborts the program with the file, line and
// CUDA error string if it did not return cudaSuccess. Without this, a failed
// allocation, copy or launch would go unnoticed and the program would print
// whatever happens to be in the result buffer.
#ifndef CUDA_CHECK
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t cudaCheckErr_ = (call);                                \
        if (cudaCheckErr_ != cudaSuccess) {                                \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__,         \
                    __LINE__, cudaGetErrorString(cudaCheckErr_));          \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)
#endif

// Multiplies an m x n_mat matrix by an n_mat x k matrix on the GPU and prints
// the m x k result, one row per output line.
//
// Steps: allocate and fill the host matrices, copy them to the device, launch
// matrixMultiplication on a 2D grid of 16x16 blocks covering the output, copy
// the result back and print it. Returns 0 on success; on a host allocation
// failure it returns EXIT_FAILURE, and on any CUDA failure CUDA_CHECK
// terminates the process.
int main() {
    // Size of matrices
    const int m = 1000;      // rows of matrix A
    const int n_mat = 1000;  // columns of matrix A and rows of matrix B
    const int k = 1000;      // columns of matrix B

    // Element and byte counts, computed in size_t so that the products cannot
    // overflow int if the dimensions above are increased.
    const size_t aCount = (size_t)m * n_mat;
    const size_t bCount = (size_t)n_mat * k;
    const size_t cCount = (size_t)m * k;
    const size_t aBytes = aCount * sizeof(float);
    const size_t bBytes = bCount * sizeof(float);
    const size_t cBytes = cCount * sizeof(float);

    // Allocate memory for matrices on host
    float *h_a = (float *)malloc(aBytes);
    float *h_b = (float *)malloc(bBytes);
    float *h_c = (float *)malloc(cBytes);
    if (h_a == NULL || h_b == NULL || h_c == NULL) {
        fprintf(stderr, "Host allocation failed\n");
        free(h_a);  // free(NULL) is a no-op, so partial success is handled
        free(h_b);
        free(h_c);
        return EXIT_FAILURE;
    }

    // Initialize matrices on host: A holds 0, 1, 2, ... and B holds 0, 2, 4, ...
    for (size_t i = 0; i < aCount; ++i) {
        h_a[i] = (float)i;
    }
    for (size_t i = 0; i < bCount; ++i) {
        h_b[i] = (float)(i * 2);
    }

    // Allocate memory for matrices on device
    float *d_a = NULL, *d_b = NULL, *d_c = NULL;
    CUDA_CHECK(cudaMalloc((void **)&d_a, aBytes));
    CUDA_CHECK(cudaMalloc((void **)&d_b, bBytes));
    CUDA_CHECK(cudaMalloc((void **)&d_c, cBytes));

    // Copy matrices from host to device
    CUDA_CHECK(cudaMemcpy(d_a, h_a, aBytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_b, h_b, bBytes, cudaMemcpyHostToDevice));

    // Define grid and block dimensions: x spans the k output columns and
    // y spans the m output rows, rounded up to whole 16x16 blocks.
    dim3 threadsPerBlock(16, 16);
    dim3 numBlocks((k + threadsPerBlock.x - 1) / threadsPerBlock.x,
                   (m + threadsPerBlock.y - 1) / threadsPerBlock.y);

    // Perform matrix multiplication on device
    matrixMultiplication<<<numBlocks, threadsPerBlock>>>(d_a, d_b, d_c, m, n_mat, k);
    CUDA_CHECK(cudaGetLastError());       // invalid launch configuration
    CUDA_CHECK(cudaDeviceSynchronize());  // faults raised while the kernel ran

    // Copy result matrix from device to host
    CUDA_CHECK(cudaMemcpy(h_c, d_c, cBytes, cudaMemcpyDeviceToHost));

    // Output result matrix
    printf("Matrix Multiplication Result:\n");
    for (int i = 0; i < m; ++i) {
        const float *rowPtr = h_c + (size_t)i * k;
        for (int j = 0; j < k; ++j) {
            printf("%f ", rowPtr[j]);
        }
        printf("\n");
    }

    // Free device memory
    CUDA_CHECK(cudaFree(d_a));
    CUDA_CHECK(cudaFree(d_b));
    CUDA_CHECK(cudaFree(d_c));

    // Free host memory
    free(h_a);
    free(h_b);
    free(h_c);

    return 0;
}


Writing matrix.cu


In [None]:
!nvcc matrix.cu -o matrix
!./matrix