In [None]:
# Print the version of the CUDA compiler (nvcc) available in this runtime.
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2022 NVIDIA Corporation
Built on Wed_Sep_21_10:33:58_PDT_2022
Cuda compilation tools, release 11.8, V11.8.89
Build cuda_11.8.r11.8/compiler.31833905_0


In [None]:
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git

Collecting git+https://github.com/andreinechaev/nvcc4jupyter.git
  Cloning https://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-h9zxrea_
  Running command git clone --filter=blob:none --quiet https://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-h9zxrea_
  Resolved https://github.com/andreinechaev/nvcc4jupyter.git to commit 0a71d56e5dce3ff1f0dd2c47c29367629262f527
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: NVCCPlugin
  Building wheel for NVCCPlugin (setup.py) ... [?25l[?25hdone
  Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-py3-none-any.whl size=4295 sha256=d7e647ce88fd0580ac78dec8c954d6f4b650389bac3953712dcdd55fca3260ca
  Stored in directory: /tmp/pip-ephem-wheel-cache-11fq2u17/wheels/a8/b9/18/23f8ef71ceb0f63297dd1903aedd067e6243a68ea756d6feea
Successfully built NVCCPlugin
Installing collected packages: NVCCPlugin
Successfully installed NVCCPlugin-0.0.2


In [None]:
# Load the NVCC plugin installed above; presumably this is what registers the
# `%%cu` cell magic used by the CUDA cell below (confirm against the plugin).
%load_ext nvcc_plugin

created output directory at /content/src
Out bin /content/result.out


In [None]:
! pip install nlohmann-json

Collecting nlohmann-json
  Downloading nlohmann_json-3.11.2-py3-none-any.whl (167 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/167.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━[0m [32m81.9/167.1 kB[0m [31m2.7 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m167.1/167.1 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: nlohmann-json
Successfully installed nlohmann-json-3.11.2


In [None]:
%%cu
/* Evaluate service written in CUDA C++ for execution on NVIDIA GPUs */
#include <iostream>
#include <nlohmann/json.hpp>
#include <cuda_runtime.h>

using json = nlohmann::json;

/* Device helper: returns the sum a + b. */
__device__ float plus(float a, float b) {
    const float sum = a + b;
    return sum;
}

/* Device helper: returns the difference a - b. */
__device__ float minus(float a, float b) {
    const float difference = a - b;
    return difference;
}

/* Device helper: returns the product a * b. */
__device__ float multiply(float a, float b) {
    const float product = a * b;
    return product;
}

/*
 * Looks up the variable `key` in the JSON object `row`.
 *
 * Returns the stored value converted to float, or 0.0 when `row` has no
 * entry named `key`.
 *
 * NOTE(review): this is declared __device__ but takes nlohmann::json and
 * std::string arguments; those are presumably host-only types, so this
 * likely cannot be compiled for the device - confirm.
 */
__device__ float handle_variable(const json& row, const std::string& key) {
    const auto entry = row.find(key);
    if (entry == row.end()) {
        return 0.0; // Default value if variable is not found
    }
    return entry->get<float>();
}

/*
 * Kernel: one thread per row index; threads whose index is >= numRows do
 * nothing. Each active thread parses `formulaJson` and `rowsJson`, folds the
 * top-level entries of the formula into an accumulator that starts at 0, and
 * stores the accumulator in results[index].
 *
 * For every top-level entry that is a JSON object, the first key is taken as
 * the operator ("+", "-" or "*"; anything else is skipped) and only the first
 * operand is applied: a string operand is resolved as a variable of the row,
 * any other operand is read as a float.
 *
 * `rowsJson` is parsed as a whole by every thread and the thread index is not
 * used to select a row, so every active thread computes the same value.
 *
 * NOTE(review): json::parse and std::string are used inside a __global__
 * function; nlohmann::json is presumably a host-only library, so this likely
 * does not compile for the device - confirm, and consider parsing on the host.
 * NOTE(review): if the formula is a single JSON object (as built in main),
 * iterating it presumably visits its values rather than {operator: operands}
 * objects, so no operator would be applied - verify against the library docs.
 */
template <typename T>
__global__ void evaluateFormula(const char* formulaJson, const char* rowsJson, T* results, int numRows) {
    const int rowIndex = blockIdx.x * blockDim.x + threadIdx.x;
    if (rowIndex >= numRows) {
        return; // Surplus thread: nothing to evaluate
    }

    const json formula = json::parse(formulaJson);
    const json row = json::parse(rowsJson);

    T accumulator = 0.0;

    for (const auto& node : formula) {
        if (!node.is_object()) {
            continue;
        }

        // The first key/value pair of the object is the operator and its operands.
        const auto entry = node.begin();
        const std::string& op = entry.key();
        const json& operands = entry.value();

        const bool isAdd = (op == "+");
        const bool isSubtract = (op == "-");
        const bool isMultiply = (op == "*");
        if (!(isAdd || isSubtract || isMultiply)) {
            continue; // Unknown operator: operands are never touched
        }

        // Only the first operand takes part in the computation.
        const json& operand = operands[0];
        float value;
        if (operand.is_string()) {
            value = handle_variable(row, operand);
        } else {
            value = operand.get<float>();
        }

        if (isAdd) {
            accumulator = plus(accumulator, value);
        } else if (isSubtract) {
            accumulator = minus(accumulator, value);
        } else {
            accumulator = multiply(accumulator, value);
        }
    }

    results[rowIndex] = accumulator;
}

/*
 * Host entry point: uploads the formula and row JSON strings to the GPU,
 * launches evaluateFormula over numRows rows, copies the results back and
 * prints the first one.
 *
 * Returns 0 on success and 1 if any CUDA call fails (a message is written to
 * std::cerr and every buffer allocated so far is released).
 */
int main() {
    // Initialize CUDA
    cudaError_t cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) {
        std::cerr << "CUDA initialization failed!" << std::endl;
        return 1;
    }

    // Define problem size
    const int numRows = 1;   // Adjust as needed
    const int blockSize = 1; // Adjust as needed
    // Round up so that gridSize * blockSize covers numRows; the kernel's own
    // bounds check discards the surplus threads of the last block.
    const int gridSize = (numRows + blockSize - 1) / blockSize;

    // JSON AST of the formula and the row data, as strings
    const std::string formulaJson = "{\"+\": [\"a\", {\"-\": [\"b\", {\"*\": [\"c\", \"d\"]}]}]}";
    const std::string rowJson = "{\"a\": 1.0, \"b\": 2.0, \"c\": 3.0, \"d\": 4.0}";
    // +1 so the terminating NUL is copied as well: the kernel receives C strings.
    const auto formulaBytes = formulaJson.size() + 1;
    const auto rowBytes = rowJson.size() + 1;

    // Host-side result buffer and device-side buffers
    float* results = new float[numRows];
    char* d_formulaJson = nullptr;
    char* d_rowJson = nullptr;
    float* d_results = nullptr;

    // Single cleanup path; cudaFree on a null pointer performs no operation.
    auto releaseAll = [&]() {
        cudaFree(d_formulaJson);
        cudaFree(d_rowJson);
        cudaFree(d_results);
        delete[] results;
    };
    auto fail = [&](const std::string& message) {
        std::cerr << message << std::endl;
        releaseAll();
        return 1;
    };

    // Allocate memory on the GPU for the inputs and the results
    cudaStatus = cudaMalloc((void**)&d_formulaJson, formulaBytes);
    if (cudaStatus == cudaSuccess) {
        cudaStatus = cudaMalloc((void**)&d_rowJson, rowBytes);
    }
    if (cudaStatus == cudaSuccess) {
        cudaStatus = cudaMalloc((void**)&d_results, numRows * sizeof(float));
    }
    if (cudaStatus != cudaSuccess) {
        return fail("CUDA memory allocation failed!");
    }

    // The kernel cannot dereference host pointers such as std::string::c_str(),
    // so the JSON text is copied into device memory first.
    cudaStatus = cudaMemcpy(d_formulaJson, formulaJson.c_str(), formulaBytes, cudaMemcpyHostToDevice);
    if (cudaStatus == cudaSuccess) {
        cudaStatus = cudaMemcpy(d_rowJson, rowJson.c_str(), rowBytes, cudaMemcpyHostToDevice);
    }
    if (cudaStatus != cudaSuccess) {
        return fail(std::string("CUDA memcpy failed: ") + cudaGetErrorString(cudaStatus));
    }

    // Launch the CUDA kernel
    evaluateFormula<float><<<gridSize, blockSize>>>(d_formulaJson, d_rowJson, d_results, numRows);

    // Check for kernel launch errors (bad configuration, missing device code, ...)
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        return fail(std::string("CUDA kernel launch failed: ") + cudaGetErrorString(cudaStatus));
    }

    // Wait for the kernel so that errors raised while it runs are reported
    // here rather than being attributed to the copy below.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        return fail(std::string("CUDA kernel execution failed: ") + cudaGetErrorString(cudaStatus));
    }

    // Copy results from GPU to CPU
    cudaStatus = cudaMemcpy(results, d_results, numRows * sizeof(float), cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        return fail(std::string("CUDA memcpy failed: ") + cudaGetErrorString(cudaStatus));
    }

    // Print or use the result
    std::cout << "Result: " << results[0] << std::endl;

    // Cleanup
    releaseAll();

    return 0;
}

/tmp/tmp_j6tbnhe/1478df41-44ff-4e53-8122-235aaf64df69.cu:3:10: fatal error: nlohmann/json.hpp: No such file or directory
    3 | #include <nlohmann/json.hpp>
      |          ^~~~~~~~~~~~~~~~~~~
compilation terminated.



In [None]:
import cupy as cp
import numpy as np
import time

def matrix_multiply_gpu(A, B):
    """Multiply two matrices on the GPU with CuPy and return a NumPy array.

    Both operands are converted to CuPy arrays, multiplied with `cp.matmul`,
    and the product is converted back to a NumPy array before returning.
    """
    device_product = cp.matmul(cp.asarray(A), cp.asarray(B))
    return cp.asnumpy(device_product)

def matrix_multiply_cpu(A, B):
    """Multiply two matrices on the CPU with NumPy and return the product."""
    host_product = np.matmul(A, B)
    return host_product

# Create two random matrices (adjust the size as needed).
# A seeded generator makes the inputs reproducible, and drawing float32
# directly avoids materialising a float64 temporary of twice the size.
matrix_size = (10000, 10000)
rng = np.random.default_rng(0)
A = rng.random(matrix_size, dtype=np.float32)
B = rng.random(matrix_size, dtype=np.float32)

# Warm up the GPU path on a tiny input so that one-time start-up costs are not
# attributed to the timed run below.
matrix_multiply_gpu(A[:2, :2], B[:2, :2])

# Measure the time taken by CuPy on GPU. The timed call includes the
# host-to-device and device-to-host copies done inside matrix_multiply_gpu.
# perf_counter is the monotonic, high-resolution clock meant for timing.
start_time_gpu = time.perf_counter()
result_gpu = matrix_multiply_gpu(A, B)
end_time_gpu = time.perf_counter()

# Measure the time taken by NumPy on CPU
start_time_cpu = time.perf_counter()
result_cpu = matrix_multiply_cpu(A, B)
end_time_cpu = time.perf_counter()

# Compare the results (for correctness)
if np.allclose(result_gpu, result_cpu):
    print("Results match.")
else:
    print("Results do not match.")

# Compare the performance
time_gpu = end_time_gpu - start_time_gpu
time_cpu = end_time_cpu - start_time_cpu

print(f"Time taken by CuPy on GPU: {time_gpu:.4f} seconds")
print(f"Time taken by NumPy on CPU: {time_cpu:.4f} seconds")

# Calculate the speedup (CPU time / GPU time); values above 1 mean the GPU was faster
speedup = time_cpu / time_gpu
print(f"Speedup: {speedup:.2f}x")


Results match.
Time taken by CuPy on GPU: 1.2838 seconds
Time taken by NumPy on CPU: 29.6650 seconds
Speedup: 23.11x


In [None]:
import cupy as cp
import numpy as np
import time

def elementwise_multiply_gpu(vector, number):
    """Multiply `vector` by `number` element-wise on the GPU with CuPy.

    Both arguments are converted to CuPy arrays, multiplied with
    `cp.multiply`, and the result is converted back to a NumPy array.
    """
    device_vector, device_number = cp.asarray(vector), cp.asarray(number)
    return cp.asnumpy(cp.multiply(device_vector, device_number))

def elementwise_multiply_cpu(vector, number):
    """Multiply `vector` by `number` on the CPU using the `*` operator."""
    scaled = vector * number
    return scaled

# Create a random vector (adjust the length as needed).
# A seeded generator makes the input reproducible, and drawing float32
# directly avoids materialising a float64 temporary of twice the size.
n = 100000000  # Length of the vector
rng = np.random.default_rng(0)
vector = rng.random(n, dtype=np.float32)
number = 2.0  # Number to multiply with

# Warm up the GPU path on a tiny input so that one-time start-up costs are not
# attributed to the timed run below.
elementwise_multiply_gpu(vector[:1], number)

# Measure the time taken by CuPy on GPU. The timed call includes the
# host-to-device and device-to-host copies done inside elementwise_multiply_gpu.
# perf_counter is the monotonic, high-resolution clock meant for timing.
start_time_gpu = time.perf_counter()
result_gpu = elementwise_multiply_gpu(vector, number)
end_time_gpu = time.perf_counter()

# Measure the time taken by NumPy on CPU
start_time_cpu = time.perf_counter()
result_cpu = elementwise_multiply_cpu(vector, number)
end_time_cpu = time.perf_counter()

# Compare the results (for correctness)
if np.allclose(result_gpu, result_cpu):
    print("Results match.")
else:
    print("Results do not match.")

# Compare the performance
time_gpu = end_time_gpu - start_time_gpu
time_cpu = end_time_cpu - start_time_cpu

print(f"Time taken by CuPy on GPU: {time_gpu:.4f} seconds")
print(f"Time taken by NumPy on CPU: {time_cpu:.4f} seconds")

# Calculate the speedup (CPU time / GPU time); values below 1 mean the CPU was faster
speedup = time_cpu / time_gpu
print(f"Speedup: {speedup:.2f}")


Results match.
Time taken by CuPy on GPU: 0.4395 seconds
Time taken by NumPy on CPU: 0.1313 seconds
Speedup: 0.30
