In [1]:
!pip install git+https://github.com/afnan47/cuda.git

Collecting git+https://github.com/afnan47/cuda.git
  Cloning https://github.com/afnan47/cuda.git to /tmp/pip-req-build-mau_vug0
  Running command git clone --filter=blob:none --quiet https://github.com/afnan47/cuda.git /tmp/pip-req-build-mau_vug0
  Resolved https://github.com/afnan47/cuda.git to commit aac710a35f52bb78ab34d2e52517237941399eff
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: NVCCPlugin
  Building wheel for NVCCPlugin (setup.py) ... [?25l[?25hdone
  Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-py3-none-any.whl size=4289 sha256=17a442dcff8fbde91b773e85bcdd3f570dd7f4e094db89e99bead154f2e11be1
  Stored in directory: /tmp/pip-ephem-wheel-cache-n86fphyr/wheels/aa/f3/44/e10c1d226ec561d971fcd4b0463f6bff08602afa928a3e7bc7
Successfully built NVCCPlugin
Installing collected packages: NVCCPlugin
Successfully installed NVCCPlugin-0.0.2


In [2]:
%load_ext nvcc_plugin

created output directory at /content/src
Out bin /content/result.out


In [6]:
%%cu
#include <bits/stdc++.h>
#include <chrono>
using namespace std;
using namespace std::chrono;

// Element-wise vector sum kernel: C[i] = A[i] + B[i].
//
// Each thread handles at most one element, selected by its flat index along
// the x dimension of the launch (blockIdx.x * blockDim.x + threadIdx.x).
// Threads whose index is not below `size` do nothing, so the launch may
// contain more threads than elements.
//
// A, B : input arrays read at the thread's index
// C    : output array written at the thread's index
// size : number of elements to process
__global__ void add(int* A, int* B, int* C, int size) {
    const int idx = threadIdx.x + blockDim.x * blockIdx.x;

    // Surplus threads in the last block have nothing to do.
    if (idx >= size) {
        return;
    }

    C[idx] = A[idx] + B[idx];
}

// Matrix product kernel: C = A * B with row-major flat storage.
//
// A is indexed as A[row * N + i], B as B[i * K + col] and C as
// C[row * K + col], i.e. A is M x N, B is N x K and C is M x K.
//
// Each thread produces at most one output element. The output row comes from
// the y dimension of the launch and the output column from the x dimension,
// so the launch has to span both dimensions for every element of C to be
// written. Threads that fall outside the M x K output do nothing.
__global__ void matrixMul(int* A, int* B, int* C, int M, int N, int K) {
    const int r = threadIdx.y + blockDim.y * blockIdx.y;
    const int c = threadIdx.x + blockDim.x * blockIdx.x;

    // Skip threads that do not map to an output element.
    if (r >= M || c >= K) {
        return;
    }

    // Dot product of row r of A with column c of B.
    int acc = 0;
    int k = 0;
    while (k < N) {
        acc += A[r * N + k] * B[k * K + c];
        ++k;
    }

    C[r * K + c] = acc;
}

// Writes the first `size` elements of `vector` to standard output on a single
// line, each element followed by one space, then ends the line (and flushes).
void print(int* vector, int size) {
    int idx = 0;
    while (idx < size) {
        std::cout << vector[idx] << " ";
        ++idx;
    }
    std::cout << std::endl;
}

// Writes a rows x cols matrix stored row-major in a flat array to standard
// output: one text line per matrix row, every element followed by a space,
// and one extra blank line after the last row.
void printMatrix(int* matrix, int rows, int cols) {
    int r = 0;
    while (r < rows) {
        int c = 0;
        while (c < cols) {
            std::cout << matrix[r * cols + c] << " ";
            ++c;
        }
        std::cout << std::endl;
        ++r;
    }
    // Trailing blank line separating the matrix from whatever follows.
    std::cout << std::endl;
}

// CPU reference for vector addition: stores A[i] + B[i] into C[i] for every
// i in [0, size). Nothing is written when size is zero or negative.
void sequentialAddition(int* A, int* B, int* C, int size) {
    // Guard first so the pointer range below is never formed for size <= 0.
    if (size <= 0) {
        return;
    }

    std::transform(A, A + size, B, C,
                   [](int lhs, int rhs) { return lhs + rhs; });
}

// CPU reference for matrix multiplication on row-major flat arrays.
//
// A is read as an M x N matrix (A[i * N + k]), B as an N x K matrix
// (B[k * K + j]); the M x K product is written to C (C[i * K + j]).
void sequentialMatrixMul(int* A, int* B, int* C, int M, int N, int K) {
    for (int r = 0; r < M; ++r) {
        for (int c = 0; c < K; ++c) {
            // Accumulate the dot product of row r of A and column c of B in a
            // local before storing, so C is written exactly once per element.
            int acc = 0;
            int k = 0;
            while (k < N) {
                acc += A[r * N + k] * B[k * K + c];
                ++k;
            }
            C[r * K + c] = acc;
        }
    }
}

// Demo driver: runs vector addition and a 2x2 matrix multiplication both on
// the CPU and on the GPU, prints the inputs and results, and reports the
// wall-clock time of each variant in microseconds.
//
// The GPU timings cover the kernel launch plus the blocking device-to-host
// copy of the result; host-to-device copies and allocations are not timed.
int main() {
    // Stops the program with a readable message as soon as a CUDA runtime
    // call fails, instead of silently continuing with invalid device state.
    auto check = [](cudaError_t status, const char* what) {
        if (status != cudaSuccess) {
            cerr << "CUDA error during " << what << ": "
                 << cudaGetErrorString(status) << endl;
            exit(EXIT_FAILURE);
        }
    };

    int N = 100000;

    // Vector addition
    // Heap-backed storage: three 100000-element arrays are large for the
    // stack, and variable-length arrays are not standard C++.
    std::vector<int> A(N), B(N), C(N);
    for (int i = 0; i < N; i++) {
        A[i] = rand() % 1000;
        B[i] = rand() % 1000;
    }

    size_t vectorBytes = static_cast<size_t>(N) * sizeof(int);

    cout << "Vector A: ";
    print(A.data(), N);
    cout << "Vector B: ";
    print(B.data(), N);

    int* X = nullptr;
    int* Y = nullptr;
    int* Z = nullptr;
    check(cudaMalloc(&X, vectorBytes), "cudaMalloc(X)");
    check(cudaMalloc(&Y, vectorBytes), "cudaMalloc(Y)");
    check(cudaMalloc(&Z, vectorBytes), "cudaMalloc(Z)");

    check(cudaMemcpy(X, A.data(), vectorBytes, cudaMemcpyHostToDevice), "copy A to device");
    check(cudaMemcpy(Y, B.data(), vectorBytes, cudaMemcpyHostToDevice), "copy B to device");

    int threadsPerBlock = 256;
    // Ceiling division so the tail of the vector is covered by a last block.
    int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;

    // Sequential addition
    auto start = high_resolution_clock::now();
    sequentialAddition(A.data(), B.data(), C.data(), N);
    auto stop = high_resolution_clock::now();
    auto seq_duration = duration_cast<microseconds>(stop - start);

    cout << "Sequential Addition: ";
    print(C.data(), N);

    // Parallel addition
    start = high_resolution_clock::now();
    add<<<blocksPerGrid, threadsPerBlock>>>(X, Y, Z, N);
    // Launch failures (e.g. a bad configuration) are only visible here.
    check(cudaGetLastError(), "add kernel launch");
    // The blocking copy also waits for the kernel to finish.
    check(cudaMemcpy(C.data(), Z, vectorBytes, cudaMemcpyDeviceToHost), "copy vector result to host");
    stop = high_resolution_clock::now();
    auto par_duration = duration_cast<microseconds>(stop - start);

    cout << "Parallel Addition: ";
    print(C.data(), N);

    cout << "Sequential Addition Time: " << seq_duration.count() << " microseconds" << endl;
    cout << "Parallel Addition Time: " << par_duration.count() << " microseconds" << endl;

    check(cudaFree(X), "cudaFree(X)");
    check(cudaFree(Y), "cudaFree(Y)");
    check(cudaFree(Z), "cudaFree(Z)");

    // Matrix multiplication: A_mat is M x N_mat, B_mat is N_mat x K.
    int M = 2, N_mat = 2, K = 2;
    int A_mat[2][2] = {{1, 2}, {3, 4}};
    int B_mat[2][2] = {{5, 6}, {7, 8}};
    int C_mat[2][2];

    cout << "Matrix A: " << endl;
    printMatrix((int*)A_mat, M, N_mat);
    cout<<endl;

    cout << "Matrix B: " << endl;
    printMatrix((int*)B_mat, N_mat, K);
    cout<<endl;

    // Sequential matrix multiplication
    start = high_resolution_clock::now();
    sequentialMatrixMul((int*)A_mat, (int*)B_mat, (int*)C_mat, M, N_mat, K);
    stop = high_resolution_clock::now();
    auto seq_mat_duration = duration_cast<microseconds>(stop - start);

    cout << "Sequential Matrix Multiplication: " << endl;
    printMatrix((int*)C_mat, M, K);

    // Parallel matrix multiplication
    size_t bytesA = static_cast<size_t>(M) * N_mat * sizeof(int);
    size_t bytesB = static_cast<size_t>(N_mat) * K * sizeof(int);
    size_t bytesC = static_cast<size_t>(M) * K * sizeof(int);

    int* X_mat = nullptr;
    int* Y_mat = nullptr;
    int* Z_mat = nullptr;
    check(cudaMalloc(&X_mat, bytesA), "cudaMalloc(X_mat)");
    check(cudaMalloc(&Y_mat, bytesB), "cudaMalloc(Y_mat)");
    check(cudaMalloc(&Z_mat, bytesC), "cudaMalloc(Z_mat)");

    check(cudaMemcpy(X_mat, A_mat, bytesA, cudaMemcpyHostToDevice), "copy A_mat to device");
    check(cudaMemcpy(Y_mat, B_mat, bytesB, cudaMemcpyHostToDevice), "copy B_mat to device");

    // matrixMul takes the row from the y dimension and the column from the x
    // dimension, so the launch must be two-dimensional. With a 1D launch every
    // thread has row 0 and the remaining rows of the result are never written.
    dim3 matThreads(16, 16);
    dim3 matBlocks((K + matThreads.x - 1) / matThreads.x,
                   (M + matThreads.y - 1) / matThreads.y);

    start = high_resolution_clock::now();
    matrixMul<<<matBlocks, matThreads>>>(X_mat, Y_mat, Z_mat, M, N_mat, K);
    check(cudaGetLastError(), "matrixMul kernel launch");
    check(cudaMemcpy(C_mat, Z_mat, bytesC, cudaMemcpyDeviceToHost), "copy matrix result to host");
    stop = high_resolution_clock::now();
    auto par_mat_duration = duration_cast<microseconds>(stop - start);

    cout << "Parallel Matrix Multiplication: " << endl;
    printMatrix((int*)C_mat, M, K);

    cout << "Sequential Matrix Multiplication Time: " << seq_mat_duration.count() << " microseconds" << endl;
    cout << "Parallel Matrix Multiplication Time: " << par_mat_duration.count() << " microseconds" << endl;

    check(cudaFree(X_mat), "cudaFree(X_mat)");
    check(cudaFree(Y_mat), "cudaFree(Y_mat)");
    check(cudaFree(Z_mat), "cudaFree(Z_mat)");

    return 0;
}


Vector A: 383 777 793 386 649 362 690 763 540 172 211 567 782 862 67 929 22 69 393 11 229 421 784 198 315 413 91 956 862 996 305 84 336 846 313 124 582 814 434 43 87 276 788 403 754 932 676 739 226 94 795 434 467 97 317 652 301 286 865 444 440 31 97 481 709 567 497 586 306 219 528 732 503 270 708 340 796 618 846 921 379 764 841 193 34 124 987 743 227 859 432 437 275 474 858 29 235 818 143 928 776 443 613 606 904 128 369 917 324 470 490 772 644 505 954 669 542 197 355 348 622 299 746 340 311 605 661 878 320 444 522 708 282 924 62 600 452 379 468 973 881 933 660 199 899 959 813 190 926 84 90 376 936 445 179 887 348 659 336 342 206 713 321 819 721 939 940 705 127 984 920 422 396 630 292 672 625 222 640 898 298 524 209 819 732 994 379 273 850 860 579 993 621 504 961 326 944 202 784 842 528 872 958 36 753 303 133 890 567 368 500 788 249 303 363 253 686 152 975 157 436 414 460 28 50 556 794 699 39 428 500 647 159 535 339 215 504 49 285 343 177 238 949 367 292 743 829 682 541 826 261 360 23 8