## Record Keeping
To keep good records of the systems that we test on start by dumping all of the information about the system to be sent to further processing.



In [None]:
import os, pathlib

# Setup directories.
curDir = pathlib.Path(pathlib.Path.cwd()).resolve()
workDir = curDir.joinpath("cgar")
workDir.mkdir(parents=True,exist_ok=True)
!cd {workDir.resolve()}

hostInfoDir = workDir.joinpath("hostInfo")
hostInfoDir.mkdir(parents=True, exist_ok=True)

ErrorInfo = workDir.joinpath("ErrorLog.txt")
!touch {ErrorInfo}

ProviderInfo = hostInfoDir.joinpath("ccPInfo.txt")
ProviderInfo.touch(600)

#Set the variable indicating that this isn't running in the cloud.
MyLocalMachine = os.environ.get('THW_MACHINE') is not None
VastAI = os.environ.get('VAST_CONTAINERLABEL') is not None
if VastAI:

  !pip install vastai
  !vastai start instance $CONTAINER_ID --api-key $CONTAINER_API_KEY
  if _exit_code != 0:
    # TODO: Log error!
    !echo "Error - This appears to be running in a Vast.ai instance, but the CLI threw an error: " {_exit_code} >> {ErrorInfo}
    # ErrorInfo

!uname -a > {ProviderInfo} || echo "Error getting Provider Info(uname)!" >> {ErrorInfo}

!echo -e "### /etc/os-release ###\n\n" >> {ProviderInfo}
!cat /etc/os-release >> {ProviderInfo}
!echo -e "### /proc/version ###\n\n" >> {ProviderInfo}
!cat /proc/version >> {ProviderInfo}
!echo -e "### uptime ###\n\n" >> {ProviderInfo}
!uptime >> {ProviderInfo} || echo "Error getting Provider Info(uptime)!" >> {ErrorInfo}
!echo -e "### environment ###\n\n" >> {ProviderInfo}
!env >> {ProviderInfo}
!echo -e "### cuda-gdb version ###\n\n" >> {ProviderInfo}
!cuda-gdb --version >> {ProviderInfo} || echo "Error getting CPU Info(cuda-gdb)!" >> {ErrorInfo}

# !command -v gcloud &> /dev/null
# if _exit_code == 0:
#   !echo -e "### gcloud info ###\n\n" >> {ProviderInfo}
#   !cuda-gdb --version >> {ProviderInfo}

# !cat {ProviderInfo}

CPUInfo = hostInfoDir.joinpath("ccCInfo.txt")
# Get CPU info in JSON
!lscpu -J > {CPUInfo} || echo "Error getting CPU Info(lscpu)!" >> {ErrorInfo}

GPUInfo = hostInfoDir.joinpath("ccGInfo.txt")
# Get GPU info
!nvidia-smi -q -x > {GPUInfo} || echo "Error getting CPU Info(nvidia-smi)!" >> {ErrorInfo}

print('Host configuration documented.')
def disconnect():
  if not MyLocalMachine:
    from google.colab import runtime
    runtime.unassign()

buildDir = workDir.joinpath("build")
buildDir.mkdir(parents=True,exist_ok=True)
kernelPath = buildDir.joinpath("CGARKernel.cu")
!cd {buildDir}

# !tar -cjf cgar-data-bundle.bz2 ./*
if VastAI:
  !vastai destroy instance $CONTAINER_ID --api-key $CONTAINER_API_KEY

/bin/bash: line 1: nvidia-smi: command not found
Host configuration documented.


In [None]:
%%writefile test.ipy

import os, pathlib

# Setup directories.
curDir = pathlib.Path(pathlib.Path.cwd()).resolve()
workDir = curDir.joinpath("cgar")
workDir.mkdir(parents=True,exist_ok=True)
!cd {workDir.resolve()}

hostInfoDir = workDir.joinpath("hostInfo")
hostInfoDir.mkdir(parents=True, exist_ok=True)

ErrorInfo = workDir.joinpath("ErrorLog.txt")
!touch {ErrorInfo}

ProviderInfo = hostInfoDir.joinpath("ccPInfo.txt")
ProviderInfo.touch(600)

#Set the variable indicating that this isn't running in the cloud.
MyLocalMachine = os.environ.get('THW_MACHINE') is not None
VastAI = os.environ.get('VAST_CONTAINERLABEL') is not None
if VastAI:

  !pip install vastai
  !vastai start instance $CONTAINER_ID --api-key $CONTAINER_API_KEY
  if _exit_code != 0:
    # TODO: Log error!
    !echo "Error - This appears to be running in a Vast.ai instance, but the CLI threw an error: " {_exit_code} >> {ErrorInfo}
    # ErrorInfo

!uname -a > {ProviderInfo} || echo "Error getting Provider Info(uname)!" >> {ErrorInfo}

!echo -e "### /etc/os-release ###\n\n" >> {ProviderInfo}
!cat /etc/os-release >> {ProviderInfo}
!echo -e "### /proc/version ###\n\n" >> {ProviderInfo}
!cat /proc/version >> {ProviderInfo}
!echo -e "### uptime ###\n\n" >> {ProviderInfo}
!uptime >> {ProviderInfo} || echo "Error getting Provider Info(uptime)!" >> {ErrorInfo}
!echo -e "### environment ###\n\n" >> {ProviderInfo}
!env >> {ProviderInfo}
!echo -e "### cuda-gdb version ###\n\n" >> {ProviderInfo}
!cuda-gdb --version >> {ProviderInfo} || echo "Error getting CPU Info(cuda-gdb)!" >> {ErrorInfo}

# !command -v gcloud &> /dev/null
# if _exit_code == 0:
#   !echo -e "### gcloud info ###\n\n" >> {ProviderInfo}
#   !cuda-gdb --version >> {ProviderInfo}

# !cat {ProviderInfo}

CPUInfo = hostInfoDir.joinpath("ccCInfo.txt")
# Get CPU info in JSON
!lscpu -J > {CPUInfo} || echo "Error getting CPU Info(lscpu)!" >> {ErrorInfo}

GPUInfo = hostInfoDir.joinpath("ccGInfo.txt")
# Get GPU info
!nvidia-smi -q -x > {GPUInfo} || echo "Error getting CPU Info(nvidia-smi)!" >> {ErrorInfo}

print('Host configuration documented.')
def disconnect():
  if VastAI:
    !vastai destroy instance $CONTAINER_ID --api-key $CONTAINER_API_KEY
  if not MyLocalMachine:
    from google.colab import runtime
    runtime.unassign()

buildDir = workDir.joinpath("build")
buildDir.mkdir(parents=True,exist_ok=True)
kernelPath = buildDir.joinpath("CGARKernel.cu")
!cd {buildDir}


# !tar -cjf cgar-data-bundle.bz2 ./*

# Setup Build Sources

In [None]:
%%writefile $kernelPath

#include <stdio.h>
#include <unistd.h>
#include <cuda.h>

__global__ void cgar_kernel(){
            printf("Hello World from GPU!\n");
}

int main() {

  int deviceCount = 0;
  cuDeviceGetCount(&deviceCount);
  if (deviceCount == 0) {
      printf("There is no device supporting CUDA.\n");
      exit (0);
  }

  // Get handle for device 0
  CUdevice cuDevice;
  cuDeviceGet(&cuDevice, 0);

  // Create context
  CUcontext cuContext;
  cuCtxCreate(&cuContext, 0, cuDevice);
  while(true){
    sleep(30);
  }
  cgar_kernel<<<1,1>>>();
  return 0;
}

Writing /content/build/CGARKernel.cu


In [None]:
os.environ['CUDA_DEBUGGER_SOFTWARE_PREEMPTION']='1'
!!cd $buildDir
# nvcc --help
# !nvcc -g -G -c $kernelPath --generate-dependencies-with-compile --dependency-output $kernelPath.stem.d -o $kernelPath.stem
!nvcc -g -G $kernelPath -o $kernelPath.stem


/usr/bin/ld: /tmp/tmpxft_00000320_00000000-11_CGARKernel.o: in function `main':
/content/build/CGARKernel.cu:13: undefined reference to `cuDeviceGetCount'
/usr/bin/ld: /content/build/CGARKernel.cu:21: undefined reference to `cuDeviceGet'
/usr/bin/ld: /content/build/CGARKernel.cu:25: undefined reference to `cuCtxCreate_v2'
collect2: error: ld returned 1 exit status


In [14]:
%%writefile cuda_test.cu
//From https://developer.nvidia.com/blog/even-easier-introduction-cuda/
#include <cuda.h>
#include <iostream>
#include <iomanip>

// Kernel function to add the elements of two arrays
__global__ void validation_gen(const unsigned int maxIdx, unsigned int* dataPtr)
{
  int index = threadIdx.x;
  int stride = blockDim.x;
  for (int idx = index; idx < maxIdx; idx += stride)
      dataPtr[idx] = stride << 16 & index;
}

int main(int argc, char *argv[])
{
  unsigned int N = 256<<2;
  unsigned int* dataPtr;

  // Allocate Unified Memory - accessible from CPU or GPU
  cudaMallocManaged(&dataPtr, N*sizeof(dataPtr[0]));

  // Run kernel on 1M elements on the GPU
  validation_gen<<<1, 256>>>(N, dataPtr);

  // Wait for GPU to finish before accessing on host
  cudaDeviceSynchronize();

  std::cout << std::hex;
  for(uint i = 0; i < N; i++){
    auto idx = i & 0xffff80;
    auto stride = i & 0x7f;
    stride = stride << 16-5;

    std::cout << "(" << std::dec << i << "): " << std::hex << dataPtr[i] << " -> " << stride << idx;
    if(i % 4 == 0){
        std::cout << std::endl;
    }
  }

  // Free memory
  cudaFree(dataPtr);

  return 0;
}

Overwriting cuda_test.cu


In [15]:
!nvcc cuda_test.cu -o cuda_validator

In [None]:
%%writefile cuda_tester.cu

#include <cuda.h>
#include <stdio.h>
#include <iostream>
#include <algorithm>

void errChk(cudaError_t status, size_t line){
    if(status != cudaSuccess){
        std::cerr << "There was a cuda error at line " << line << "." << std::endl;
        std::cerr << "Error (" << status << "): " << cudaGetErrorName(status) << "::" << cudaGetErrorString(status) << std::endl;
        throw 1;
    }
}

#define MByte 1024*1024

__global__ void validationGen(const unsigned int* endPtr, unsigned int* basePtr)
{
  auto index = threadIdx.x;
  auto stride = blockDim.x;
  for (int i = index; i < n; i += stride)
      y[i] = x[i] + y[i];
}

int main() {

    cudaError_t cuErr;

    cudaDeviceProp props;
    errChk(cudaGetDeviceProperties(&props, 0), __LINE__);

    size_t totalMem = props.totalGlobalMem;

    void* devPtr;
    std::cout << "Trying to allocate " << totalMem/MByte << " M bytes." << std::endl;
    while(cudaMalloc(&devPtr, totalMem) != cudaSuccess)
    {
        cuErr = cudaPeekAtLastError();
        if(totalMem < 5*MByte || cuErr != cudaErrorMemoryAllocation){
            std::cerr << "There was a cuda error when allocating " << totalMem << " bytes." << std::endl;
            errChk(cuErr, __LINE__);
            return 2;
        }
        totalMem -= MByte;
        std::cout << "Trying to allocate " << totalMem/MByte << " M bytes." << std::endl;
    }
    std::cout << "Successfully allocated " << totalMem << " bytes." << std::endl;


    char* hostPtr = (char*)malloc(totalMem);
    try{
        errChk(cudaMemcpy((void*)hostPtr, devPtr, totalMem, cudaMemcpyDeviceToHost), __LINE__);

        std::cout << "Read " << totalMem << " bytes from GPU memory." << std::endl;

        errChk(cudaFree(devPtr), __LINE__);

        std::cout << "Writing " << totalMem << " bytes to gdump.bin file." << std::endl;
        FILE* dumpFilePtr = fopen("gdump.bin", "wb");
        if(dumpFilePtr == NULL){
            std::cerr << "Error opening 'gdump.bin'!" << std::endl;
            return 1;
        }

        // Optimizations for sparse file generation.

        //Check larger sections using 64b
        unsigned long long* startDataPtr = reinterpret_cast<unsigned long long*>(hostPtr);
        unsigned long long* endHostPtr = reinterpret_cast<unsigned long long*>(hostPtr+totalMem);
        unsigned long long* endDataPtr = startDataPtr;

        while(endDataPtr < endHostPtr){
          //Get ptr to first non-zero data.
          startDataPtr = std::find_if(startDataPtr,
                                endHostPtr, [](auto datum){ return datum != 0;});
          //Get ptr to first zero data after non-zero data.
          endDataPtr = std::find_if(static_cast<unsigned long long*>(startDataPtr),
                                endHostPtr, [](auto datum){ return datum == 0;});

          std::cout << "  Writing " << (endDataPtr - startDataPtr) * sizeof(*endDataPtr)
                << " bytes offset by " << (char*)startDataPtr - hostPtr << " from " << startDataPtr << " to " << endDataPtr << "." << std::endl;
          fseek(dumpFilePtr, (char*)startDataPtr - hostPtr, SEEK_SET);
          fwrite(startDataPtr, sizeof(*startDataPtr), endDataPtr-startDataPtr, dumpFilePtr);
          fflush(dumpFilePtr);
        }
        //fwrite(hostPtr, 1, totalMem, dumpFilePtr);

        fclose(dumpFilePtr);

        free(hostPtr);
    }
    catch(int err){
        cudaFree(devPtr);
        free(hostPtr);
        return 1;
    }
    return 0;
}

Writing cuda_test.cu


In [None]:
os.environ['CUDA_DEBUGGER_SOFTWARE_PREEMPTION']='1'
!cd /content/
# nvcc --help
# !nvcc -g -G -c $kernelPath --generate-dependencies-with-compile --dependency-output $kernelPath.stem.d -o $kernelPath.stem
!nvcc -g -G basic_read.cu -o basic_read

In [None]:

disconnect()

# Promising info

https://docs.nvidia.com/cuda/cuda-gdb/#variable-storage-and-accessibility

https://forums.developer.nvidia.com/t/gpu-memory-dump-with-cuda-gdb-python/46226/2



# Python Approach

Maybe this can be combined with `cuda-gdb` to simplify the process.

In [None]:
!pip install cuda-python
from cuda import cuda, nvrtc



In [None]:
from cuda import cuda, nvrtc
import time

# Much of this is from the cuda-python docs at https://nvidia.github.io/cuda-python/overview.html

def _cudaGetErrorEnum(error):
    if isinstance(error, cuda.CUresult):
        err, name = cuda.cuGetErrorName(error)
        return name if err == cuda.CUresult.CUDA_SUCCESS else "<unknown>"
    elif isinstance(error, nvrtc.nvrtcResult):
        return nvrtc.nvrtcGetErrorString(error)[1]
    else:
        raise RuntimeError('Unknown error type: {}'.format(error))

def errChk(result):
    if result[0].value:
        raise RuntimeError("CUDA error code={}({})".format(result[0].value, _cudaGetErrorEnum(result[0])))
    if len(result) == 1:
        return None
    elif len(result) == 2:
        return result[1]
    else:
        return result[1:]

errChk(cuda.cuInit(0))

numDevices = errChk(cuda.cuDeviceGetCount())
if numDevices < 1:
  raise RuntimeError("No CUDA devices.")
else:
  print(f"There are {numDevices} CUDA devices.")

for devIdx in range(numDevices):
  print(errChk(cuda.cuDeviceGetProperties(devIdx)))

#TODO decide if we look at all of the devices????

dev = errChk(cuda.cuDeviceGet(0))

context = errChk(cuda.cuCtxCreate(0, dev))
try:
  cuda.cuMemGetInfo()
  waitCount = 10
  print('Starting wait loop.')
  while waitCount >= 0:
    print('.')
    waitCount -= 1
    time.sleep(60)
finally:
  errChk(cuda.cuCtxDestroy(context))

RuntimeError: Failed to dlopen libcuda.so.1

# Old CUDA Setup

In [None]:
# import torch, os, math, gzip, pickle
# import matplotlib.pyplot as plt
# from urllib.request import urlretrieve
# from pathlib import Path

# from torch import tensor
# import torchvision as tv
# import torchvision.transforms.functional as tvf
# from torchvision import io
# from torch.utils.cpp_extension import load_inline

# os.environ['CUDA_LAUNCH_BLOCKING']='1'
# %pip install -q wurlitzer ninja
# %load_ext wurlitzer

# def load_cuda(cuda_src, cpp_src, funcs, opt=False, verbose=False):
#     return load_inline(cuda_sources=[cuda_src], cpp_sources=[cpp_src], functions=funcs,
#                        extra_cuda_cflags=["-O2"] if opt else [], verbose=verbose,
#                        name="inline_ext", build_directory=buildDir, keep_intermediates=True)


In [None]:

# os.environ['CUDA_LAUNCH_BLOCKING']='1'
# buildDir = os.curdir + "/build"
# !mkdir -p {buildDir}

# cuda_begin = r'''
# #include <torch/extension.h>
# #include <stdio.h>
# #include <unistd.h>
# #include <c10/cuda/CUDAException.h>

# #define CHECK_CUDA(x) TORCH_CHECK(x.device().is_cuda(), #x " must be a CUDA tensor")
# #define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
# #define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x)

# inline unsigned int cdiv(unsigned int a, unsigned int b) { return (a + b - 1) / b;}
# '''

# cuda_src = cuda_begin + r'''
# __global__ void cuda_test_kernel(unsigned char* x, int n) {
#     int i = blockIdx.x*blockDim.x + threadIdx.x;

#     int test;
#     if (i<n) {
#       test = 0.2989*x[i] + 0.5870*x[i+n] + 0.1140*x[i+2*n];
#     }
# }

# int cuda_test_host(int n) {
#     // CHECK_INPUT(input);
#     //int h = input.size(1);
#     //int w = input.size(2);
#     //printf("h*w: %d*%d\n", h, w);
#     //auto output = torch::empty({h,w}, input.options());
#     //int threads = 256;

#     while(1==1){
#       sleep(30);
#     }
#     unsigned char testArray[100];
#     testArray[0] = 'h';

#     cuda_test_kernel<<<1, 1>>>(&testArray[0], 0);
#     C10_CUDA_KERNEL_LAUNCH_CHECK();
#     return 1;
# }'''

# cpp_src = "int cuda_test_host(int n);"

# module = load_cuda(cuda_src, cpp_src, ['cuda_test_host'], verbose=True)


The input conditions for extension module inline_ext have changed. Bumping to version 1 and re-building as inline_ext_v1...
Detected CUDA files, patching ldflags
Emitting ninja build file ./build/build.ninja...
If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST'].
Building extension module inline_ext_v1...
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)


[1/3] c++ -MMD -MF main.o.d -DTORCH_EXTENSION_NAME=inline_ext_v1 -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1011\" -isystem /usr/local/lib/python3.10/dist-packages/torch/include -isystem /usr/local/lib/python3.10/dist-packages/torch/include/torch/csrc/api/include -isystem /usr/local/lib/python3.10/dist-packages/torch/include/TH -isystem /usr/local/lib/python3.10/dist-packages/torch/include/THC -isystem /usr/local/cuda/include -isystem /usr/include/python3.10 -D_GLIBCXX_USE_CXX11_ABI=0 -fPIC -std=c++17 -c /content/build/main.cpp -o main.o 
[2/3] /usr/local/cuda/bin/nvcc --generate-dependencies-with-compile --dependency-output cuda.cuda.o.d -DTORCH_EXTENSION_NAME=inline_ext_v1 -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1011\" -isystem /usr/local/lib/python3.10/dist-packages/torch/include -isystem /usr/local/lib/python3

Loading extension module inline_ext_v1...
