In [None]:
!which nvcc

/usr/local/cuda/bin/nvcc


In [None]:
%%writefile vector_add_input_fixed.cu
#include <stdio.h>
#include <stdlib.h>

// Element-wise sum of two integer arrays: c[i] = a[i] + b[i] for 0 <= i < n.
// Each thread handles at most one element, chosen by its flat 1D global
// index; threads whose index is n or beyond return without touching memory.
__global__ void vectorAdd(int *a, int *b, int *c, int n) {
    const int idx = threadIdx.x + blockDim.x * blockIdx.x;
    if (idx >= n) {
        return;  // surplus thread in the last block
    }
    c[idx] = a[idx] + b[idx];
}

// Terminates the program with a diagnostic if a CUDA runtime call failed.
// `what` names the operation so the message points at the failing step.
static void checkCuda(cudaError_t status, const char *what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Reads exactly `count` whitespace-separated integers from stdin into `dst`.
// Returns 1 on success, 0 if the input ends early or contains a non-integer.
static int readInts(int *dst, int count) {
    for (int i = 0; i < count; i++) {
        if (scanf("%d", &dst[i]) != 1) {
            return 0;
        }
    }
    return 1;
}

// Reads an element count n followed by two n-element integer vectors from
// stdin, adds them on the GPU, and prints the n sums separated by spaces.
// Returns 0 on success and EXIT_FAILURE on malformed input or allocation
// failure; CUDA runtime errors terminate the program via checkCuda().
int main() {
    int n;
    if (scanf("%d", &n) != 1 || n < 0) {
        fprintf(stderr, "Expected a non-negative element count on stdin\n");
        return EXIT_FAILURE;
    }

    if (n == 0) {
        // Nothing to add. A launch with zero blocks is an invalid
        // configuration, so emit the empty result line without using the GPU.
        printf("\n");
        return 0;
    }

    const size_t bytes = (size_t)n * sizeof(int);

    int *a = (int*)malloc(bytes);
    int *b = (int*)malloc(bytes);
    int *c = (int*)malloc(bytes);
    if (a == NULL || b == NULL || c == NULL) {
        fprintf(stderr, "Host allocation of %d elements failed\n", n);
        free(a); free(b); free(c);
        return EXIT_FAILURE;
    }

    if (!readInts(a, n) || !readInts(b, n)) {
        fprintf(stderr, "Expected 2 * %d integers after the element count\n", n);
        free(a); free(b); free(c);
        return EXIT_FAILURE;
    }

    int *d_a = NULL, *d_b = NULL, *d_c = NULL;
    checkCuda(cudaMalloc(&d_a, bytes), "cudaMalloc(d_a)");
    checkCuda(cudaMalloc(&d_b, bytes), "cudaMalloc(d_b)");
    checkCuda(cudaMalloc(&d_c, bytes), "cudaMalloc(d_c)");

    checkCuda(cudaMemcpy(d_a, a, bytes, cudaMemcpyHostToDevice), "copy of a to device");
    checkCuda(cudaMemcpy(d_b, b, bytes, cudaMemcpyHostToDevice), "copy of b to device");

    const int blockSize = 256;
    // Ceil-division written so that it cannot overflow int for n near INT_MAX.
    const int gridSize = n / blockSize + (n % blockSize != 0);
    vectorAdd<<<gridSize, blockSize>>>(d_a, d_b, d_c, n);
    checkCuda(cudaGetLastError(), "vectorAdd launch");          // bad launch configuration
    checkCuda(cudaDeviceSynchronize(), "vectorAdd execution");  // faults raised while running

    checkCuda(cudaMemcpy(c, d_c, bytes, cudaMemcpyDeviceToHost), "copy of c to host");

    for (int i = 0; i < n; i++) {
        printf("%d ", c[i]);
    }
    printf("\n");

    cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
    free(a); free(b); free(c);
    return 0;
}


Overwriting vector_add_input_fixed.cu


In [None]:
!nvcc -arch=sm_75 vector_add_input_fixed.cu -o vector_add_input_fixed

In [None]:
!echo -e "5\n1 2 3 4 5\n10 20 30 40 50" | ./vector_add_input_fixed

11 22 33 44 55 


In [None]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2024 NVIDIA Corporation
Built on Thu_Jun__6_02:18:23_PDT_2024
Cuda compilation tools, release 12.5, V12.5.82
Build cuda_12.5.r12.5/compiler.34385749_0


In [None]:
%%writefile cuda_runtime.cu

#include <stdio.h>
#include <stdlib.h>

#define N 512  // Size of the square matrix (N x N)

// Integer matrix product C = A * B for n x n matrices stored as flat arrays
// indexed M[row * n + col]. Expects a 2D launch: the x dimension of the
// grid/block selects the output column and the y dimension the output row.
// Each in-range thread accumulates one output element; threads that fall
// outside the n x n range return without touching memory.
__global__ void matrixMul(int *A, int *B, int *C, int n) {
    const int r = threadIdx.y + blockDim.y * blockIdx.y;  // output row
    const int c = threadIdx.x + blockDim.x * blockIdx.x;  // output column

    if (r >= n || c >= n) {
        return;
    }

    const int *aRow = A + r * n;  // first element of row r in A
    int acc = 0;
    for (int k = 0; k < n; ++k) {
        acc += aRow[k] * B[k * n + c];
    }
    C[r * n + c] = acc;
}

// Terminates the program with a diagnostic if a CUDA runtime call failed.
// `what` names the operation so the message points at the failing step.
static void checkCuda(cudaError_t status, const char *what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Multiplies two N x N integer matrices (A filled with 1, B filled with 2)
// on the GPU and prints the first and last elements of the product.
// Returns 0 on success and EXIT_FAILURE if host allocation fails; CUDA
// runtime errors terminate the program via checkCuda().
int main() {
    // Byte count kept in size_t so larger N cannot overflow an int.
    const size_t size = (size_t)N * N * sizeof(int);
    int *A, *B, *C;                                // Host matrices
    int *d_A = NULL, *d_B = NULL, *d_C = NULL;     // Device matrices

    // Allocate memory on host
    A = (int*)malloc(size);
    B = (int*)malloc(size);
    C = (int*)malloc(size);
    if (A == NULL || B == NULL || C == NULL) {
        fprintf(stderr, "Host allocation of %d x %d matrices failed\n", N, N);
        free(A); free(B); free(C);
        return EXIT_FAILURE;
    }

    // Initialize matrices A and B
    for (int i = 0; i < N * N; i++) {
        A[i] = 1;  // Or any values you want
        B[i] = 2;
    }

    // Allocate memory on device
    checkCuda(cudaMalloc((void**)&d_A, size), "cudaMalloc(d_A)");
    checkCuda(cudaMalloc((void**)&d_B, size), "cudaMalloc(d_B)");
    checkCuda(cudaMalloc((void**)&d_C, size), "cudaMalloc(d_C)");

    // Copy data from host to device
    checkCuda(cudaMemcpy(d_A, A, size, cudaMemcpyHostToDevice), "copy of A to device");
    checkCuda(cudaMemcpy(d_B, B, size, cudaMemcpyHostToDevice), "copy of B to device");

    // Launch configuration. The locals are named block/grid rather than
    // blockDim/gridDim so they do not shadow the CUDA built-in names, and the
    // grid is derived from the block size instead of repeating the literal 16.
    const dim3 block(16, 16);  // Each block has 16x16 threads
    const dim3 grid((N + block.x - 1) / block.x,
                    (N + block.y - 1) / block.y);  // Enough blocks to cover matrix

    // Launch the kernel
    matrixMul<<<grid, block>>>(d_A, d_B, d_C, N);
    checkCuda(cudaGetLastError(), "matrixMul launch");          // bad launch configuration
    checkCuda(cudaDeviceSynchronize(), "matrixMul execution");  // faults raised while running

    // Copy result back to host
    checkCuda(cudaMemcpy(C, d_C, size, cudaMemcpyDeviceToHost), "copy of C to host");

    // Verify a few values (optional)
    printf("Result at (0,0): %d\n", C[0]);
    printf("Result at (N-1,N-1): %d\n", C[N*N - 1]);

    // Free memory
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);

    free(A);
    free(B);
    free(C);

    return 0;
}


Writing cuda_runtime.cu


In [None]:
!nvcc -arch=sm_75 -gencode=arch=compute_75,code=sm_75 cuda_runtime.cu -o cuda_runtime

In [None]:
!./cuda_runtime

Result at (0,0): 1024
Result at (N-1,N-1): 1024
