Permalink
Cannot retrieve contributors at this time
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
1240 lines (1078 sloc)
48.9 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| int NumberAlloc=0; | |
| #if defined(CUDA) | |
| cudaDeviceProp deviceProperties; | |
| int *sm13DeviceList; // Device handles for each CUDA >=1.3 capable device | |
| int deviceCount = 0; // Total number of devices | |
| int sm13DeviceCount = 0; // Number of devices that are CUDA >=1.3 capable | |
| //---------------------------------------------------------------// | |
| // Find all CUDA >=1.3-capable devices // | |
| //---------------------------------------------------------------// | |
| // Check for number of devices total | |
| PRINTF("before cudaGetDeviceCount \n"); | |
| cudaGetDeviceCount(&deviceCount); | |
| PRINTF("after cudaGetDeviceCount \n"); | |
| if(deviceCount == 0) | |
| { | |
| ERROR_STRING("There are no CUDA devices.\n"); | |
| } | |
| else | |
| { | |
| PRINTF("There %s %i device%s.\n", deviceCount > 1 ? "are" : "is", | |
| deviceCount, | |
| deviceCount > 1 ? "s" : ""); | |
| } | |
| // Make list of devices | |
| sm13DeviceList = (int*) calloc(deviceCount, sizeof(int)); | |
| for(int deviceID = 0; deviceID < deviceCount; deviceID++) | |
| { | |
| // Check device properties | |
| if(cudaGetDeviceProperties(&deviceProperties, deviceID) == cudaSuccess) | |
| { | |
| PRINTF("Found device [%d:%d]:\n", sm13DeviceCount, deviceID); | |
| PRINTF(" Name: %s\n", deviceProperties.name); | |
| PRINTF(" Compute capability: %i.%i\n", deviceProperties.major, deviceProperties.minor); | |
| PRINTF(" Total memory: %li bytes\n", deviceProperties.totalGlobalMem); | |
| PRINTF(" Threads per block: %i\n",deviceProperties.maxThreadsPerBlock); | |
| PRINTF(" Max block dimensions: %i x %i x %i\n", deviceProperties.maxThreadsDim[0], deviceProperties.maxThreadsDim[1], deviceProperties.maxThreadsDim[2]); | |
| PRINTF(" Max grid size: %i x %i x %i\n", deviceProperties.maxGridSize[0], deviceProperties.maxGridSize[1], deviceProperties.maxGridSize[2]); | |
| PRINTF(" Shared memory per block: %i bytes\n", (int) deviceProperties.sharedMemPerBlock); | |
| PRINTF(" Registers per block: %i\n", deviceProperties.regsPerBlock); | |
| // Check major/minor CUDA versions | |
| if(deviceProperties.major >=3 && strstr(deviceProperties.name, DefaultGPUDeviceName_pr) ) | |
| { | |
| PRINTF(" At least one device available [%s] for calculations \n", deviceProperties.name); | |
| // Add device to 3-capable list | |
| sm13DeviceList[sm13DeviceCount] = deviceID; | |
| sm13DeviceCount++; | |
| } | |
| } | |
| } | |
| // Were any of the devices 1.3-capable? | |
| if(sm13DeviceCount == 0) | |
| { | |
| ERROR_STRING("There are no devices supporting CUDA or that matches selected device.\n"); | |
| } | |
| if (INHOST(DefaultGPUDeviceNumber)>= sm13DeviceCount) | |
| { | |
| PRINTF("The requested device [%i] (0-base index) is more than the number of devices available [%i] \n",INHOST(DefaultGPUDeviceNumber),sm13DeviceCount); | |
| ERROR_STRING("Unable to select requested device.\n"); | |
| } | |
| PRINTF("Selecting device [%s] with number [%i] for calculations\n", DefaultGPUDeviceName_pr,INHOST(DefaultGPUDeviceNumber)); | |
| mxcheckGPUErrors(cudaSetDevice(sm13DeviceList[INHOST(DefaultGPUDeviceNumber)])); | |
| mxcheckGPUErrors(cudaDeviceSetCacheConfig (cudaFuncCachePreferL1)); | |
| #endif | |
| #ifdef OPENCL | |
| cl_uint numPlatforms; | |
| int err; | |
| cl_device_id device_id[10]; // compute device id | |
| cl_context context; // compute context | |
| cl_command_queue commands; // compute command queue | |
| cl_program program; // compute program | |
| cl_kernel StressKernel; // compute kernel | |
| cl_kernel ParticleKernel; // compute kernel | |
| cl_kernel SnapShot; // compute kernel | |
| cl_kernel SensorsKernel; // compute kernel | |
| cl_char device_name[1024]; | |
| // Find number of platforms | |
| mxcheckGPUErrors(clGetPlatformIDs(0, NULL, &numPlatforms)); | |
| if (numPlatforms == 0) | |
| { | |
| ERROR_STRING("Found 0 OPENCL platforms!\n"); | |
| } | |
| // Get all platforms | |
| cl_platform_id * Platform = (cl_platform_id*) malloc(numPlatforms*sizeof(cl_platform_id)); | |
| mxcheckGPUErrors(clGetPlatformIDs(numPlatforms, Platform, NULL)); | |
| // Secure a GPU | |
| unsigned int total_devices; | |
| int SelDevice=-1; | |
| // Create a compute context | |
| for (unsigned int icpu = 0; icpu < numPlatforms; icpu++) | |
| { | |
| err = clGetDeviceIDs(Platform[icpu], CL_DEVICE_TYPE_ALL, 10, device_id, &total_devices); | |
| if (err == CL_SUCCESS) | |
| { | |
| break; | |
| } | |
| } | |
| if (device_id[0] == NULL) | |
| ERROR_STRING("Found 0 OPENCL devices!\n"); | |
| for (unsigned int icpu = 0; icpu <total_devices;icpu++) | |
| { | |
| mxcheckGPUErrors(output_device_info(device_id[icpu])); | |
| err = clGetDeviceInfo(device_id[icpu], CL_DEVICE_NAME, sizeof(device_name), &device_name, NULL); | |
| PRINTF("GPU device = %s\n",device_name); | |
| if (NULL!=strstr((char *)device_name,DefaultGPUDeviceName_pr)) | |
| { | |
| PRINTF("Found %s device!\n",DefaultGPUDeviceName_pr); | |
| SelDevice=icpu; | |
| break; | |
| } | |
| } | |
| if (SelDevice==-1) | |
| { | |
| PRINTF("Device requested %s \n",DefaultGPUDeviceName_pr); | |
| ERROR_STRING("Device requested was not found!\n"); | |
| } | |
| size_t size_bits; | |
| cl_uint address_bits; | |
| clGetDeviceInfo(device_id[SelDevice], CL_DEVICE_ADDRESS_BITS, 0, NULL, &size_bits); | |
| clGetDeviceInfo(device_id[SelDevice], CL_DEVICE_ADDRESS_BITS, size_bits, &address_bits, NULL); | |
| PRINTF("size: %lu , bits: %u\n", size_bits, address_bits); | |
| if (address_bits==32) | |
| { | |
| PRINTF("********************************\n"); | |
| PRINTF("WARNING - OpenCL driver only supports 32 bits, simulations only will be\n"); | |
| PRINTF("WARNING - useful for domains that uses less than 4 GB in total memory\n"); | |
| PRINTF("WARNING - Program may crash without an easy way to catch the error\n"); | |
| PRINTF("WARNING - Consider using CUDA backend if NVIDIA GPU in Win or Linux, or METAL backend in MacOS\n"); | |
| } | |
| context = clCreateContext(0, 1, &device_id[SelDevice], NULL, NULL, &err); | |
| mxcheckGPUErrors(err); | |
| // Create a command queue | |
| commands = clCreateCommandQueue(context, device_id[SelDevice], 0, &err); | |
| mxcheckGPUErrors(err); | |
| #endif | |
| #ifdef METAL | |
| _PT _c_mex_type[12] = {0,0,0,0,0,0,0,0,0,0,0,0}; | |
| _PT _c_uint_type = 0; | |
| _PT HOST_INDEX_MEX[LENGTH_INDEX_MEX][2]; //need to encode 64 bits numbers in 32 arrays... | |
| _PT HOST_INDEX_UINT[LENGTH_INDEX_UINT][2]; | |
| ns::Array<mtlpp::Device> AllDev= mtlpp::Device::CopyAllDevices(); | |
| mxcheckGPUErrors(((int)AllDev)); | |
| if (AllDev.GetSize()==0) | |
| { | |
| ERROR_STRING("Found 0 METAL platforms!\n"); | |
| } | |
| unsigned int SelDevice=0; | |
| { | |
| for (int _n = 0;_n<AllDev.GetSize();_n++) | |
| { | |
| PRINTF("Metal device available: %i %s\n",_n,AllDev[_n].GetName().GetCStr()); | |
| if (NULL!=strstr(AllDev[_n].GetName().GetCStr(),DefaultGPUDeviceName_pr)) | |
| { | |
| PRINTF("Found %s device!\n",DefaultGPUDeviceName_pr); | |
| SelDevice=_n; | |
| break; | |
| } | |
| } | |
| } | |
| if (SelDevice==-1) | |
| { | |
| PRINTF("Device requested %s \n",DefaultGPUDeviceName_pr); | |
| ERROR_STRING("Device requested was not found!\n"); | |
| } | |
| mtlpp::Device device= AllDev[SelDevice]; | |
| ns::Error error; | |
| ns::String PathToLib(kernbinfile_pr); | |
| mtlpp::Library library = device.NewLibrary(PathToLib, &error); | |
| if (((int)library)==0 ) | |
| { | |
| PRINTF("GetLocalizedDescription = %s\n",error.GetLocalizedDescription().GetCStr()); | |
| PRINTF("GetLocalizedFailureReason = %s\n",error.GetLocalizedFailureReason().GetCStr()); | |
| PRINTF("GetLocalizedRecoverySuggestion = %s\n",error.GetLocalizedRecoverySuggestion().GetCStr()); | |
| PRINTF("GetLocalizedRecoveryOptions = %s\n",error.GetLocalizedRecoveryOptions().GetCStr()); | |
| PRINTF("GetHelpAnchor = %s\n",error.GetHelpAnchor().GetCStr()); | |
| PRINTF("GetCode = %i\n",error.GetCode()); | |
| ERROR_STRING("Error loading Metal library") | |
| } | |
| mxcheckGPUErrors(((int)library)); | |
| PRINTF("After compiling code \n"); | |
| GET_KERNEL_STRESS_FUNCTION(PML_1) | |
| GET_KERNEL_STRESS_FUNCTION(PML_2) | |
| GET_KERNEL_STRESS_FUNCTION(PML_3) | |
| GET_KERNEL_STRESS_FUNCTION(PML_4) | |
| GET_KERNEL_STRESS_FUNCTION(PML_5) | |
| GET_KERNEL_STRESS_FUNCTION(PML_6) | |
| GET_KERNEL_STRESS_FUNCTION(MAIN_1) | |
| GET_KERNEL_PARTICLE_FUNCTION(PML_1) | |
| GET_KERNEL_PARTICLE_FUNCTION(PML_2) | |
| GET_KERNEL_PARTICLE_FUNCTION(PML_3) | |
| GET_KERNEL_PARTICLE_FUNCTION(PML_4) | |
| GET_KERNEL_PARTICLE_FUNCTION(PML_5) | |
| GET_KERNEL_PARTICLE_FUNCTION(PML_6) | |
| GET_KERNEL_PARTICLE_FUNCTION(MAIN_1) | |
| mtlpp::Function SnapShotFunc = library.NewFunction("SnapShot"); | |
| mxcheckGPUErrors(((int)SnapShotFunc)); | |
| mtlpp::ComputePipelineState computePipelineStateSnapShot = device.NewComputePipelineState(SnapShotFunc, nullptr); | |
| mxcheckGPUErrors(((int)computePipelineStateSnapShot)); | |
| mtlpp::Function SensorsKernelFunc = library.NewFunction("SensorsKernel"); | |
| mxcheckGPUErrors(((int)SensorsKernelFunc)); | |
| mtlpp::ComputePipelineState computePipelineStateSensors = device.NewComputePipelineState(SensorsKernelFunc, nullptr); | |
| mxcheckGPUErrors(((int)computePipelineStateSensors)); | |
| PRINTF("After getting all functions code \n"); | |
| mtlpp::CommandQueue commandQueue = device.NewCommandQueue(); | |
| mxcheckGPUErrors(((int)commandQueue)); | |
| mtlpp::Buffer _CONSTANT_BUFFER_UINT = device.NewBuffer(sizeof(unsigned int) * LENGTH_CONST_UINT, mtlpp::ResourceOptions::StorageModeManaged); | |
| mxcheckGPUErrors(((int)_CONSTANT_BUFFER_UINT)); | |
| mtlpp::Buffer _CONSTANT_BUFFER_MEX = device.NewBuffer(sizeof(mexType) * LENGTH_CONST_MEX, mtlpp::ResourceOptions::StorageModeManaged); | |
| mxcheckGPUErrors(((int)_CONSTANT_BUFFER_MEX)); | |
| #endif | |
| //initilizing constant memory variables | |
| InitSymbol(DT,mexType,G_FLOAT); | |
| InitSymbol(N1,unsigned int,G_INT); | |
| InitSymbol(N2,unsigned int,G_INT); | |
| InitSymbol(N3,unsigned int,G_INT); | |
| InitSymbol(Limit_I_low_PML,unsigned int,G_INT); | |
| InitSymbol(Limit_J_low_PML,unsigned int,G_INT); | |
| InitSymbol(Limit_K_low_PML,unsigned int,G_INT); | |
| InitSymbol(Limit_I_up_PML,unsigned int,G_INT); | |
| InitSymbol(Limit_J_up_PML,unsigned int,G_INT); | |
| InitSymbol(Limit_K_up_PML,unsigned int,G_INT); | |
| InitSymbol(SizeCorrI,unsigned int,G_INT); | |
| InitSymbol(SizeCorrJ,unsigned int,G_INT); | |
| InitSymbol(SizeCorrK,unsigned int,G_INT); | |
| InitSymbol(PML_Thickness,unsigned int,G_INT); | |
| InitSymbol(NumberSources,unsigned int,G_INT); | |
| InitSymbol(LengthSource,unsigned int,G_INT); | |
| InitSymbol(ZoneCount,unsigned int,G_INT); | |
| InitSymbol(SizePMLxp1,unsigned int,G_INT); | |
| InitSymbol(SizePMLyp1,unsigned int,G_INT); | |
| InitSymbol(SizePMLzp1,unsigned int,G_INT); | |
| InitSymbol(SizePML,unsigned int,G_INT); | |
| InitSymbol(SizePMLxp1yp1zp1,unsigned int,G_INT); | |
| InitSymbol(NumberSensors,unsigned int,G_INT); | |
| InitSymbol(TimeSteps,unsigned int,G_INT); | |
| InitSymbol(SelRMSorPeak,unsigned int,G_INT); | |
| InitSymbol(SelMapsRMSPeak,unsigned int,G_INT); | |
| InitSymbol(IndexRMSPeak_ALLV,unsigned int,G_INT); | |
| InitSymbol(IndexRMSPeak_Vx,unsigned int,G_INT); | |
| InitSymbol(IndexRMSPeak_Vy,unsigned int,G_INT); | |
| InitSymbol(IndexRMSPeak_Vz,unsigned int,G_INT); | |
| InitSymbol(IndexRMSPeak_Sigmaxx,unsigned int,G_INT); | |
| InitSymbol(IndexRMSPeak_Sigmayy,unsigned int,G_INT); | |
| InitSymbol(IndexRMSPeak_Sigmazz,unsigned int,G_INT); | |
| InitSymbol(IndexRMSPeak_Sigmaxy,unsigned int,G_INT); | |
| InitSymbol(IndexRMSPeak_Sigmaxz,unsigned int,G_INT); | |
| InitSymbol(IndexRMSPeak_Sigmayz,unsigned int,G_INT); | |
| InitSymbol(IndexRMSPeak_Pressure,unsigned int,G_INT); | |
| InitSymbol(NumberSelRMSPeakMaps,unsigned int,G_INT); | |
| InitSymbol(SelMapsSensors,unsigned int,G_INT); | |
| InitSymbol(IndexSensor_ALLV,unsigned int,G_INT); | |
| InitSymbol(IndexSensor_Vx,unsigned int,G_INT); | |
| InitSymbol(IndexSensor_Vy,unsigned int,G_INT); | |
| InitSymbol(IndexSensor_Vz,unsigned int,G_INT); | |
| InitSymbol(IndexSensor_Sigmaxx,unsigned int,G_INT); | |
| InitSymbol(IndexSensor_Sigmayy,unsigned int,G_INT); | |
| InitSymbol(IndexSensor_Sigmazz,unsigned int,G_INT); | |
| InitSymbol(IndexSensor_Sigmaxy,unsigned int,G_INT); | |
| InitSymbol(IndexSensor_Sigmaxz,unsigned int,G_INT); | |
| InitSymbol(IndexSensor_Sigmayz,unsigned int,G_INT); | |
| InitSymbol(IndexSensor_Pressure,unsigned int,G_INT); | |
| InitSymbol(NumberSelSensorMaps,unsigned int,G_INT); | |
| InitSymbol(SensorSubSampling,unsigned int,G_INT); | |
| InitSymbol(SensorStart,unsigned int,G_INT); | |
| //~ | |
| #ifdef CUDA //CUDA specifics | |
| mxcheckGPUErrors(cudaMemcpyToSymbol(gpuInvDXDTpluspr,InvDXDTplus_pr,(INHOST(PML_Thickness)+1)*sizeof(mexType))); | |
| mxcheckGPUErrors(cudaMemcpyToSymbol(gpuDXDTminuspr,DXDTminus_pr,(INHOST(PML_Thickness)+1)*sizeof(mexType))); | |
| mxcheckGPUErrors(cudaMemcpyToSymbol(gpuInvDXDTplushppr,InvDXDTplushp_pr,(INHOST(PML_Thickness)+1)*sizeof(mexType))); | |
| mxcheckGPUErrors(cudaMemcpyToSymbol(gpuDXDTminushppr,DXDTminushp_pr,(INHOST(PML_Thickness)+1)*sizeof(mexType))); | |
| #endif | |
| #if defined(OPENCL) || defined(METAL) | |
| InitSymbolArray(InvDXDTplus,G_FLOAT,INHOST(PML_Thickness)+1); | |
| InitSymbolArray(DXDTminus,G_FLOAT,INHOST(PML_Thickness)+1); | |
| InitSymbolArray(InvDXDTplushp,G_FLOAT,INHOST(PML_Thickness)+1); | |
| InitSymbolArray(DXDTminushp,G_FLOAT,INHOST(PML_Thickness)+1); | |
| #endif | |
| #ifdef OPENCL | |
| //PRINTF("%s",BUFFER_FOR_GPU_CODE); | |
| // size_t szKernelLength = strlen(BUFFER_FOR_GPU_CODE); | |
| // program = clCreateProgramWithSource(context, 1, (const char **) & BUFFER_FOR_GPU_CODE, &szKernelLength, &err); | |
| // mxcheckGPUErrors(err); | |
| char scmd [8000]; | |
| snprintf(scmd,8000,"\"%s\" --input \"%s\" --output \"%s\" --device %i",PI_OCL_PATH_pr, | |
| kernelfile_pr,kernbinfile_pr, | |
| SelDevice); | |
| PRINTF("compiling kernel with \"%s\"\n",scmd) | |
| if (system(scmd)!=0) | |
| { | |
| ERROR_STRING("Error when trying to compile program"); | |
| } | |
| char * binary; | |
| size_t binary_size; | |
| long l_szie; | |
| cl_int binary_status; | |
| binary = common_read_file(kernbinfile_pr, &l_szie); | |
| if (binary==NULL) | |
| { | |
| PRINTF("problem when trying to open kernel binary [%s]\n",kernbinfile_pr); | |
| ERROR_STRING("Stopping execution") | |
| } | |
| binary_size=l_szie; | |
| program = clCreateProgramWithBinary( | |
| context, 1, &device_id[SelDevice], &binary_size, | |
| (const unsigned char **)&binary, &binary_status, &err | |
| ); | |
| mxcheckGPUErrors(err); | |
| free(binary); | |
| PRINTF("After clCreateProgramWithSource\n"); | |
| // Build the program | |
| err = clBuildProgram(program, 1, &device_id[SelDevice], NULL, NULL, NULL); | |
| if (err != CL_SUCCESS) | |
| { | |
| size_t len; | |
| char buffer[200048]; | |
| PRINTF("Error: Failed to build program executable!\n%s\n", opencl_err_code(err)); | |
| clGetProgramBuildInfo(program, device_id[SelDevice], CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len); | |
| PRINTF("%s\n", buffer); | |
| ERROR_STRING("Unable to build program"); | |
| } | |
| // Create the compute kernel from the program | |
| StressKernel = clCreateKernel(program, "StressKernel", &err); | |
| mxcheckGPUErrors(err); | |
| ParticleKernel = clCreateKernel(program, "ParticleKernel", &err); | |
| mxcheckGPUErrors(err); | |
| SnapShot = clCreateKernel(program, "SnapShot", &err); | |
| mxcheckGPUErrors(err); | |
| SensorsKernel = clCreateKernel(program, "SensorsKernel", &err); | |
| mxcheckGPUErrors(err); | |
| #endif | |
| //Only used these for the PML | |
| _PT SizeCopy; | |
| ownGpuCalloc(V_x_x,mexType,INHOST(SizePMLxp1)); | |
| ownGpuCalloc(V_y_x,mexType,INHOST(SizePMLxp1)); | |
| ownGpuCalloc(V_z_x,mexType,INHOST(SizePMLxp1)); | |
| ownGpuCalloc(V_x_y,mexType,INHOST(SizePMLyp1)); | |
| ownGpuCalloc(V_y_y,mexType,INHOST(SizePMLyp1)); | |
| ownGpuCalloc(V_z_y,mexType,INHOST(SizePMLyp1)); | |
| ownGpuCalloc(V_x_z,mexType,INHOST(SizePMLzp1)); | |
| ownGpuCalloc(V_y_z,mexType,INHOST(SizePMLzp1)); | |
| ownGpuCalloc(V_z_z,mexType,INHOST(SizePMLzp1)); | |
| ownGpuCalloc(Sigma_x_xx,mexType,INHOST(SizePML)); | |
| ownGpuCalloc(Sigma_y_xx,mexType,INHOST(SizePML)); | |
| ownGpuCalloc(Sigma_z_xx,mexType,INHOST(SizePML)); | |
| ownGpuCalloc(Sigma_x_yy,mexType,INHOST(SizePML)); | |
| ownGpuCalloc(Sigma_y_yy,mexType,INHOST(SizePML)); | |
| ownGpuCalloc(Sigma_z_yy,mexType,INHOST(SizePML)); | |
| ownGpuCalloc(Sigma_x_zz,mexType,INHOST(SizePML)); | |
| ownGpuCalloc(Sigma_y_zz,mexType,INHOST(SizePML)); | |
| ownGpuCalloc(Sigma_z_zz,mexType,INHOST(SizePML)); | |
| ownGpuCalloc(Sigma_x_xy,mexType,INHOST(SizePMLxp1yp1zp1)); | |
| ownGpuCalloc(Sigma_y_xy,mexType,INHOST(SizePMLxp1yp1zp1)); | |
| ownGpuCalloc(Sigma_x_xz,mexType,INHOST(SizePMLxp1yp1zp1)); | |
| ownGpuCalloc(Sigma_z_xz,mexType,INHOST(SizePMLxp1yp1zp1)); | |
| ownGpuCalloc(Sigma_y_yz,mexType,INHOST(SizePMLxp1yp1zp1)); | |
| ownGpuCalloc(Sigma_z_yz,mexType,INHOST(SizePMLxp1yp1zp1)); | |
| SizeCopy = GET_NUMBER_ELEMS(Sigma_xx_res); | |
| ownGpuCalloc(Rxx,mexType,SizeCopy); | |
| ownGpuCalloc(Ryy,mexType,SizeCopy); | |
| ownGpuCalloc(Rzz,mexType,SizeCopy); | |
| SizeCopy = GET_NUMBER_ELEMS(Sigma_xy_res); | |
| ownGpuCalloc(Rxy,mexType,SizeCopy); | |
| ownGpuCalloc(Rxz,mexType,SizeCopy); | |
| ownGpuCalloc(Ryz,mexType,SizeCopy); | |
| //These come from the user input | |
| CreateAndCopyFromMXVarOnGPU(LambdaMiuMatOverH,mexType); | |
| CreateAndCopyFromMXVarOnGPU(LambdaMatOverH ,mexType); | |
| CreateAndCopyFromMXVarOnGPU(MiuMatOverH,mexType); | |
| CreateAndCopyFromMXVarOnGPU(TauLong,mexType); | |
| CreateAndCopyFromMXVarOnGPU(OneOverTauSigma ,mexType); | |
| CreateAndCopyFromMXVarOnGPU(TauShear,mexType); | |
| CreateAndCopyFromMXVarOnGPU(InvRhoMatH ,mexType); | |
| CreateAndCopyFromMXVarOnGPU(Ox ,mexType); | |
| CreateAndCopyFromMXVarOnGPU(Oy ,mexType); | |
| CreateAndCopyFromMXVarOnGPU(Oz ,mexType); | |
| CreateAndCopyFromMXVarOnGPU(SourceFunctions ,mexType); | |
| CreateAndCopyFromMXVarOnGPU(IndexSensorMap ,unsigned int); | |
| CreateAndCopyFromMXVarOnGPU(SourceMap ,unsigned int); | |
| CreateAndCopyFromMXVarOnGPU(MaterialMap ,unsigned int); | |
| ownGpuCalloc(Vx,mexType,GET_NUMBER_ELEMS(Vx_res)); | |
| ownGpuCalloc(Vy,mexType,GET_NUMBER_ELEMS(Vy_res)); | |
| ownGpuCalloc(Vz,mexType,GET_NUMBER_ELEMS(Vz_res)); | |
| ownGpuCalloc(Sigma_xx,mexType,GET_NUMBER_ELEMS(Sigma_xx_res)); | |
| ownGpuCalloc(Sigma_yy,mexType,GET_NUMBER_ELEMS(Sigma_yy_res)); | |
| ownGpuCalloc(Sigma_zz,mexType,GET_NUMBER_ELEMS(Sigma_zz_res)); | |
| ownGpuCalloc(Sigma_xy,mexType,GET_NUMBER_ELEMS(Sigma_xy_res)); | |
| ownGpuCalloc(Sigma_xz,mexType,GET_NUMBER_ELEMS(Sigma_xz_res)); | |
| ownGpuCalloc(Sigma_yz,mexType,GET_NUMBER_ELEMS(Sigma_yz_res)); | |
| ownGpuCalloc(Pressure,mexType,GET_NUMBER_ELEMS(Pressure_res)); | |
| #ifdef CUDA | |
| mexType * gpu_Snapshots_pr=NULL; | |
| InputDataKernel pHost; | |
| #endif | |
| #ifdef OPENCL | |
| cl_mem gpu_Snapshots_pr; | |
| #endif | |
| #ifdef METAL | |
| mtlpp::Buffer gpu_Snapshots_pr; | |
| #endif | |
| CreateAndCopyFromMXVarOnGPU2(Snapshots,mexType); | |
| CreateAndCopyFromMXVarOnGPU(SensorOutput,mexType); | |
| CreateAndCopyFromMXVarOnGPU(SqrAcc,mexType); | |
| #ifdef METAL | |
| mtlpp::Buffer _MEX_BUFFER[12]; | |
| for (_PT ii=0;ii<12;ii++) | |
| { | |
| PRINTF("Allocating Buffer %i with %lu float entries\n",ii,_c_mex_type[ii]); | |
| _MEX_BUFFER[ii]= device.NewBuffer(sizeof(mexType) *_c_mex_type[ii], | |
| mtlpp::ResourceOptions::StorageModeManaged); | |
| mxcheckGPUErrors(((int)_MEX_BUFFER[ii])); | |
| if (_MEX_BUFFER[ii].GetLength() != sizeof(mexType) *_c_mex_type[ii]) | |
| { | |
| PRINTF("ERROR, size of buffer is not what is expected %lu, %lu\n",_MEX_BUFFER[ii].GetLength(),sizeof(mexType) *_c_mex_type[ii]); | |
| ERROR_STRING("Stopping simulation"); | |
| } | |
| } | |
| mtlpp::Buffer _UINT_BUFFER = device.NewBuffer(sizeof(unsigned int) *_c_uint_type, | |
| mtlpp::ResourceOptions::StorageModeManaged); | |
| mxcheckGPUErrors(((int)_UINT_BUFFER)); | |
| mtlpp::Buffer _INDEX_MEX = device.NewBuffer(sizeof(unsigned int) * | |
| LENGTH_INDEX_MEX*2, | |
| mtlpp::ResourceOptions::StorageModeManaged); | |
| mxcheckGPUErrors(((int)_INDEX_MEX)); | |
| mtlpp::Buffer _INDEX_UINT = device.NewBuffer(sizeof(unsigned int) * | |
| LENGTH_INDEX_UINT*2, | |
| mtlpp::ResourceOptions::StorageModeManaged); | |
| mxcheckGPUErrors(((int)_INDEX_UINT)); | |
| { | |
| unsigned int * inData = static_cast<unsigned int *>(_INDEX_MEX.GetContents()); | |
| for (uint32_t j=0; j<LENGTH_INDEX_MEX; j++) | |
| { | |
| inData[j*2] = (unsigned int) (0xFFFFFFFF & HOST_INDEX_MEX[j][0]); | |
| inData[j*2+1] = (unsigned int) (HOST_INDEX_MEX[j][0]>>32); | |
| } | |
| _INDEX_MEX.DidModify(ns::Range(0, sizeof(unsigned int) * LENGTH_INDEX_MEX*2)); | |
| } | |
| { | |
| unsigned int * inData = static_cast< unsigned int *>(_INDEX_UINT.GetContents()); | |
| for (uint32_t j=0; j<LENGTH_INDEX_UINT; j++) | |
| { | |
| inData[j*2] = (unsigned int) (0xFFFFFFFF & HOST_INDEX_UINT[j][0]); | |
| inData[j*2+1] = (unsigned int) (HOST_INDEX_UINT[j][0]>>32); | |
| } | |
| _INDEX_UINT.DidModify(ns::Range(0, sizeof(unsigned int) * LENGTH_INDEX_UINT*2)); | |
| } | |
| _CONSTANT_BUFFER_UINT.DidModify(ns::Range(0, sizeof(unsigned int)*LENGTH_CONST_UINT)); | |
| _CONSTANT_BUFFER_MEX.DidModify(ns::Range(0,sizeof(mexType) * LENGTH_CONST_MEX)); | |
| CompleteCopyToGpu(LambdaMiuMatOverH,mexType); | |
| CompleteCopyToGpu(LambdaMatOverH ,mexType); | |
| CompleteCopyToGpu(MiuMatOverH,mexType); | |
| CompleteCopyToGpu(TauLong,mexType); | |
| CompleteCopyToGpu(OneOverTauSigma ,mexType); | |
| CompleteCopyToGpu(TauShear,mexType); | |
| CompleteCopyToGpu(InvRhoMatH ,mexType); | |
| CompleteCopyToGpu(Ox ,mexType); | |
| CompleteCopyToGpu(Oy ,mexType); | |
| CompleteCopyToGpu(Oz ,mexType); | |
| CompleteCopyToGpu(SourceFunctions ,mexType); | |
| CompleteCopyToGpu(IndexSensorMap ,unsigned int); | |
| CompleteCopyToGpu(SourceMap ,unsigned int); | |
| CompleteCopyToGpu(MaterialMap ,unsigned int); | |
| _PT totalfloat=0; | |
| for (_PT ii=0;ii<12;ii++) | |
| { | |
| _MEX_BUFFER[ii].DidModify(ns::Range(0,sizeof(mexType) *_c_mex_type[ii])); | |
| totalfloat+=_c_mex_type[ii]; | |
| } | |
| _UINT_BUFFER.DidModify(ns::Range(0,sizeof(unsigned int) *_c_uint_type)); | |
| PRINTF("Total float entries %lu and int entries %lu\n",totalfloat,_c_uint_type); | |
| #endif | |
| //putting Pointers in structure | |
| InParamP(V_x_x); | |
| InParamP(V_y_x); | |
| InParamP(V_z_x); | |
| InParamP(V_x_y); | |
| InParamP(V_y_y); | |
| InParamP(V_z_y); | |
| InParamP(V_x_z); | |
| InParamP(V_y_z); | |
| InParamP(V_z_z); | |
| InParamP(Sigma_x_xx); | |
| InParamP(Sigma_y_xx); | |
| InParamP(Sigma_z_xx); | |
| InParamP(Sigma_x_yy); | |
| InParamP(Sigma_y_yy); | |
| InParamP(Sigma_z_yy); | |
| InParamP(Sigma_x_zz); | |
| InParamP(Sigma_y_zz); | |
| InParamP(Sigma_z_zz); | |
| InParamP(Sigma_x_xy); | |
| InParamP(Sigma_y_xy); | |
| InParamP(Sigma_x_xz); | |
| InParamP(Sigma_z_xz); | |
| InParamP(Sigma_y_yz); | |
| InParamP(Sigma_z_yz); | |
| InParamP(LambdaMiuMatOverH); | |
| InParamP(LambdaMatOverH); | |
| InParamP(MiuMatOverH); | |
| InParamP(TauLong); | |
| InParamP(OneOverTauSigma); | |
| InParamP(TauShear); | |
| InParamP(InvRhoMatH); | |
| InParamP(SourceFunctions); | |
| InParamP(SourceMap); | |
| InParamP(MaterialMap); | |
| InParamP(Vx); | |
| InParamP(Vy); | |
| InParamP(Vz); | |
| InParamP(Rxx); | |
| InParamP(Ryy); | |
| InParamP(Rzz); | |
| InParamP(Rxy); | |
| InParamP(Rxz); | |
| InParamP(Ryz); | |
| InParamP(Sigma_xx); | |
| InParamP(Sigma_yy); | |
| InParamP(Sigma_zz); | |
| InParamP(Sigma_xy); | |
| InParamP(Sigma_xz); | |
| InParamP(Sigma_yz); | |
| InParamP(SqrAcc); | |
| InParamP(Pressure); | |
| #ifdef CUDA | |
| InParamP(SensorOutput); | |
| #endif | |
| InParamP(Ox); | |
| InParamP(Oy); | |
| InParamP(Oz); | |
| //unsigned int MaxIndex = INHOST(InternalIndexCount)> INHOST(EdgeIndexCount) ? INHOST(InternalIndexCount):INHOST(EdgeIndexCount); | |
| #if defined(CUDA) | |
| //copying the structure to GPU memory | |
| InputDataKernel * pGPU; | |
| mxcheckGPUErrors(cudaMalloc((void **)&pGPU,sizeof(InputDataKernel))); | |
| NumberAlloc++; | |
| PRINTF("size of InputDataKernel =%ld\n", sizeof(InputDataKernel)); | |
| mxcheckGPUErrors(cudaMemcpy(pGPU, &pHost, sizeof(InputDataKernel), cudaMemcpyHostToDevice)); | |
| int minBlockSize; | |
| int minGridSize; | |
| //Calculate the block dimensions | |
| #define CUDA_GRID_BLOC_BASE(__KERNEL__)\ | |
| dim3 dimBlock## __KERNEL__; \ | |
| dim3 dimGrid## __KERNEL__; \ | |
| mxcheckGPUErrors(cudaOccupancyMaxPotentialBlockSize( &minGridSize, &minBlockSize,\ | |
| __KERNEL__, 0, 0));\ | |
| PRINTF("minGridSize and Blocksize from API for " #__KERNEL__ " = %i and %i\n",minGridSize,minBlockSize)\ | |
| dimBlock## __KERNEL__.x=4;\ | |
| dimBlock## __KERNEL__.y=4;\ | |
| dimBlock## __KERNEL__.z=(unsigned int)floor(minBlockSize/(dimBlock ## __KERNEL__.x*dimBlock ## __KERNEL__.y)); | |
| #define CUDA_GRID_BLOC_CALC_MAIN(__KERNEL__)\ | |
| CUDA_GRID_BLOC_BASE(__KERNEL__)\ | |
| dimGrid## __KERNEL__.x = (unsigned int)ceil((float)(INHOST(N1)-INHOST(PML_Thickness)*2) / dimBlock ## __KERNEL__.x);\ | |
| dimGrid## __KERNEL__.y = (unsigned int)ceil((float)(INHOST(N2)-INHOST(PML_Thickness)*2) / dimBlock ## __KERNEL__.y);\ | |
| dimGrid## __KERNEL__.z = (unsigned int)ceil((float)(INHOST(N3)-INHOST(PML_Thickness)*2) / dimBlock ## __KERNEL__.z);\ | |
| PRINTF(#__KERNEL__ " block size to %dx%dx%d\n", dimBlock ## __KERNEL__.x, dimBlock ## __KERNEL__.y,dimBlock## __KERNEL__.z);\ | |
| PRINTF(#__KERNEL__ " Stress grid size to %dx%dx%d\n", dimGrid ## __KERNEL__.x, dimGrid ## __KERNEL__.y,dimGrid## __KERNEL__.z); | |
| #define CUDA_GRID_BLOC_CALC_PML(__TYPE__)\ | |
| CUDA_GRID_BLOC_BASE(PML_1_ ##__TYPE__ ##Kernel);\ | |
| CUDA_GRID_BLOC_BASE(PML_2_ ##__TYPE__ ##Kernel);\ | |
| CUDA_GRID_BLOC_BASE(PML_3_ ##__TYPE__ ##Kernel);\ | |
| CUDA_GRID_BLOC_BASE(PML_4_ ##__TYPE__ ##Kernel);\ | |
| CUDA_GRID_BLOC_BASE(PML_5_ ##__TYPE__ ##Kernel);\ | |
| CUDA_GRID_BLOC_BASE(PML_6_ ##__TYPE__ ##Kernel);\ | |
| dimGridPML_1_## __TYPE__ ##Kernel.x =(unsigned int)ceil((float)(INHOST(PML_Thickness)*2) / (float) dimBlockPML_1_## __TYPE__ ##Kernel.x);\ | |
| dimGridPML_1_## __TYPE__ ##Kernel.y=(unsigned int)ceil((float)(INHOST(PML_Thickness)*2) / (float) dimBlockPML_1_## __TYPE__ ##Kernel.y);\ | |
| dimGridPML_1_## __TYPE__ ##Kernel.z=(unsigned int)ceil((float)(INHOST(PML_Thickness)*2) / (float) dimBlockPML_1_## __TYPE__ ##Kernel.z);\ | |
| \ | |
| dimGridPML_2_## __TYPE__ ##Kernel.x=(unsigned int)ceil((float)(INHOST(PML_Thickness)*2) / (float) dimBlockPML_2_## __TYPE__ ##Kernel.x);\ | |
| dimGridPML_2_## __TYPE__ ##Kernel.y=(unsigned int)ceil((float)(INHOST(N2)-INHOST(PML_Thickness)*2) / (float) dimBlockPML_2_## __TYPE__ ##Kernel.y);\ | |
| dimGridPML_2_## __TYPE__ ##Kernel.z=(unsigned int)ceil((float)(INHOST(N3)-INHOST(PML_Thickness)*2) / (float) dimBlockPML_2_## __TYPE__ ##Kernel.z);\ | |
| \ | |
| dimGridPML_3_## __TYPE__ ##Kernel.x=(unsigned int)ceil((float)(INHOST(N1)-INHOST(PML_Thickness)*2) / (float) dimBlockPML_3_## __TYPE__ ##Kernel.x);\ | |
| dimGridPML_3_## __TYPE__ ##Kernel.y=(unsigned int)ceil((float)(INHOST(PML_Thickness)*2) / (float) dimBlockPML_3_## __TYPE__ ##Kernel.y);\ | |
| dimGridPML_3_## __TYPE__ ##Kernel.z=(unsigned int)ceil((float)(INHOST(N3)-INHOST(PML_Thickness)*2) / (float) dimBlockPML_3_## __TYPE__ ##Kernel.z);\ | |
| \ | |
| dimGridPML_4_## __TYPE__ ##Kernel.x=(unsigned int)ceil((float)(INHOST(N1)-INHOST(PML_Thickness)*2) / (float) dimBlockPML_4_## __TYPE__ ##Kernel.x);\ | |
| dimGridPML_4_## __TYPE__ ##Kernel.y=(unsigned int)ceil((float)(INHOST(N2)-INHOST(PML_Thickness)*2) / (float) dimBlockPML_4_## __TYPE__ ##Kernel.y);\ | |
| dimGridPML_4_## __TYPE__ ##Kernel.z=(unsigned int)ceil((float)(INHOST(PML_Thickness)*2) / (float) dimBlockPML_4_## __TYPE__ ##Kernel.z);\ | |
| \ | |
| dimGridPML_5_## __TYPE__ ##Kernel.x=(unsigned int)ceil((float)(INHOST(PML_Thickness)*2) / (float) dimBlockPML_5_## __TYPE__ ##Kernel.x);\ | |
| dimGridPML_5_## __TYPE__ ##Kernel.y=(unsigned int)ceil((float)(INHOST(PML_Thickness)*2) / (float) dimBlockPML_5_## __TYPE__ ##Kernel.y);\ | |
| dimGridPML_5_## __TYPE__ ##Kernel.z=(unsigned int)ceil((float)(INHOST(N3)-INHOST(PML_Thickness)*2) / (float) dimBlockPML_5_## __TYPE__ ##Kernel.z);\ | |
| \ | |
| dimGridPML_6_## __TYPE__ ##Kernel.x=(unsigned int)ceil((float)(INHOST(N1)-INHOST(PML_Thickness)*2) / (float) dimBlockPML_6_## __TYPE__ ##Kernel.x);\ | |
| dimGridPML_6_## __TYPE__ ##Kernel.y=(unsigned int)ceil((float)(INHOST(PML_Thickness)*2) / (float) dimBlockPML_6_## __TYPE__ ##Kernel.y);\ | |
| dimGridPML_6_## __TYPE__ ##Kernel.z=(unsigned int)ceil((float)(INHOST(PML_Thickness)*2) / (float) dimBlockPML_6_## __TYPE__ ##Kernel.z);\ | |
| \ | |
| PRINTF("PML_1_global_" #__TYPE__ "=[%i,%i,%i],[%i,%i,%i]\n",dimGridPML_1_## __TYPE__ ##Kernel.x,dimGridPML_1_## __TYPE__ ##Kernel.y,dimGridPML_1_## __TYPE__ ##Kernel.z,\ | |
| dimGridPML_1_## __TYPE__ ##Kernel.x*dimBlockPML_1_## __TYPE__ ##Kernel.x,dimGridPML_1_## __TYPE__ ##Kernel.y*dimBlockPML_1_## __TYPE__ ##Kernel.y,dimGridPML_1_## __TYPE__ ##Kernel.z*dimBlockPML_1_## __TYPE__ ##Kernel.z);\ | |
| PRINTF("PML_2_global_" #__TYPE__ "=[%i,%i,%i],[%i,%i,%i]\n",dimGridPML_2_## __TYPE__ ##Kernel.x,dimGridPML_2_## __TYPE__ ##Kernel.y,dimGridPML_2_## __TYPE__ ##Kernel.z,\ | |
| dimGridPML_2_## __TYPE__ ##Kernel.x*dimBlockPML_2_## __TYPE__ ##Kernel.x,dimGridPML_2_## __TYPE__ ##Kernel.y*dimBlockPML_2_## __TYPE__ ##Kernel.y,dimGridPML_2_## __TYPE__ ##Kernel.z*dimBlockPML_2_## __TYPE__ ##Kernel.z);\ | |
| PRINTF("PML_3_global_" #__TYPE__ "=[%i,%i,%i],[%i,%i,%i]\n",dimGridPML_3_## __TYPE__ ##Kernel.x,dimGridPML_3_## __TYPE__ ##Kernel.y,dimGridPML_3_## __TYPE__ ##Kernel.z,\ | |
| dimGridPML_3_## __TYPE__ ##Kernel.x*dimBlockPML_3_## __TYPE__ ##Kernel.x,dimGridPML_3_## __TYPE__ ##Kernel.y*dimBlockPML_3_## __TYPE__ ##Kernel.y,dimGridPML_3_## __TYPE__ ##Kernel.z*dimBlockPML_3_## __TYPE__ ##Kernel.z);\ | |
| PRINTF("PML_4_global_" #__TYPE__ "=[%i,%i,%i],[%i,%i,%i]\n",dimGridPML_4_## __TYPE__ ##Kernel.x,dimGridPML_4_## __TYPE__ ##Kernel.y,dimGridPML_4_## __TYPE__ ##Kernel.z,\ | |
| dimGridPML_4_## __TYPE__ ##Kernel.x*dimBlockPML_4_## __TYPE__ ##Kernel.x,dimGridPML_4_## __TYPE__ ##Kernel.y*dimBlockPML_4_## __TYPE__ ##Kernel.y,dimGridPML_4_## __TYPE__ ##Kernel.z*dimBlockPML_4_## __TYPE__ ##Kernel.z);\ | |
| PRINTF("PML_5_global_" #__TYPE__ "=[%i,%i,%i],[%i,%i,%i]\n",dimGridPML_5_## __TYPE__ ##Kernel.x,dimGridPML_5_## __TYPE__ ##Kernel.y,dimGridPML_5_## __TYPE__ ##Kernel.z,\ | |
| dimGridPML_5_## __TYPE__ ##Kernel.x*dimBlockPML_5_## __TYPE__ ##Kernel.x,dimGridPML_5_## __TYPE__ ##Kernel.y*dimBlockPML_5_## __TYPE__ ##Kernel.y,dimGridPML_5_## __TYPE__ ##Kernel.z*dimBlockPML_5_## __TYPE__ ##Kernel.z);\ | |
| PRINTF("PML_6_global_" #__TYPE__ "=[%i,%i,%i],[%i,%i,%i]\n",dimGridPML_6_## __TYPE__ ##Kernel.x,dimGridPML_6_## __TYPE__ ##Kernel.y,dimGridPML_6_## __TYPE__ ##Kernel.z,\ | |
| dimGridPML_6_## __TYPE__ ##Kernel.x*dimBlockPML_6_## __TYPE__ ##Kernel.x,dimGridPML_6_## __TYPE__ ##Kernel.y*dimBlockPML_6_## __TYPE__ ##Kernel.y,dimGridPML_6_## __TYPE__ ##Kernel.z*dimBlockPML_6_## __TYPE__ ##Kernel.z); | |
| CUDA_GRID_BLOC_CALC_MAIN(MAIN_1_StressKernel); | |
| CUDA_GRID_BLOC_CALC_PML(Stress); | |
| CUDA_GRID_BLOC_CALC_MAIN(MAIN_1_ParticleKernel); | |
| CUDA_GRID_BLOC_CALC_PML(Particle); | |
| //We handle the case the user wants to specify manually the computing grid sizes | |
| if (ManualLocalSize_pr[0] != -1) | |
| { | |
| dimBlockMAIN_1_StressKernel.x=(unsigned int)ManualLocalSize_pr[0]; | |
| dimBlockMAIN_1_StressKernel.y=(unsigned int)ManualLocalSize_pr[1]; | |
| dimBlockMAIN_1_StressKernel.z=(unsigned int)ManualLocalSize_pr[2]; | |
| dimBlockMAIN_1_ParticleKernel.x=(unsigned int)ManualLocalSize_pr[0]; | |
| dimBlockMAIN_1_ParticleKernel.y=(unsigned int)ManualLocalSize_pr[1]; | |
| dimBlockMAIN_1_ParticleKernel.z=(unsigned int)ManualLocalSize_pr[2]; | |
| } | |
| if (ManualGroupSize_pr[0] != -1) | |
| { | |
| dimGridMAIN_1_StressKernel.x = (unsigned int)ManualGroupSize_pr[0]; | |
| dimGridMAIN_1_StressKernel.y = (unsigned int)ManualGroupSize_pr[1]; | |
| dimGridMAIN_1_StressKernel.z = (unsigned int)ManualGroupSize_pr[2]; | |
| dimGridMAIN_1_ParticleKernel.x = (unsigned int)ManualGroupSize_pr[0]; | |
| dimGridMAIN_1_ParticleKernel.y = (unsigned int)ManualGroupSize_pr[1]; | |
| dimGridMAIN_1_ParticleKernel.z = (unsigned int)ManualGroupSize_pr[2]; | |
| } | |
| dim3 dimBlockSnap; | |
| dim3 dimGridSnap; | |
| mxcheckGPUErrors(cudaOccupancyMaxPotentialBlockSize( &minGridSize, &minBlockSize, | |
| SnapShot, 0, 0)); | |
| PRINTF("N1:minGridSize and Blocksize from API for SnapShot = %i and %i\n",minGridSize,minBlockSize); | |
| dimBlockSnap.x=8; | |
| dimBlockSnap.y=(unsigned int)floor(minBlockSize/(dimBlockSnap.x)); | |
| dimGridSnap.x = (unsigned int)ceil((float)(INHOST(N1)+1) / dimBlockSnap.x); | |
| dimGridSnap.y = (unsigned int)ceil((float)(INHOST(N2)+1) / dimBlockSnap.y); | |
| PRINTF(" Snapshot block size to %dx%d\n", dimBlockSnap.x, dimBlockSnap.y); | |
| PRINTF(" Snapshot grid size to %dx%d\n", dimGridSnap.x, dimGridSnap.y); | |
| dim3 dimBlockSensors; | |
| dim3 dimGridSensors; | |
| mxcheckGPUErrors(cudaOccupancyMaxPotentialBlockSize( &minGridSize, &minBlockSize, | |
| SensorsKernel, 0, 0)); | |
| PRINTF("minGridSize and Blocksize from API for SensorsKernel = %i and %i\n",minGridSize,minBlockSize); | |
| dimBlockSensors.x=minBlockSize; | |
| dimBlockSensors.y=1; | |
| dimGridSensors.x = (unsigned int)ceil((float)(INHOST(NumberSensors)) / dimBlockSensors.x); | |
| dimGridSensors.y=1; | |
| PRINTF(" set sensor block size to %dx%d\n", dimBlockSensors.x, dimBlockSensors.y); | |
| PRINTF(" set sensor grid size to %dx%d\n", dimGridSensors.x, dimGridSensors.y); | |
| size_t free_byte ; | |
| size_t total_byte ; | |
| mxcheckGPUErrors(cudaMemGetInfo( &free_byte, &total_byte )); | |
| double free_db = (double)free_byte ; | |
| double total_db = (double)total_byte ; | |
| double used_db = total_db - free_db ; | |
| PRINTF("GPU memory usage: used = %f, free = %f MB, total = %f MB\n",used_db/1024.0/1024.0, free_db/1024.0/1024.0, total_db/1024.0/1024.0); | |
| #define TOTAL_streams 7 | |
| cudaStream_t streams[TOTAL_streams]; | |
| for (unsigned n =0;n<TOTAL_streams;n++) | |
| mxcheckGPUErrors(cudaStreamCreate ( &streams[n])) ; | |
| #endif | |
| #ifdef OPENCL | |
| //We handle the case the user wants to specify manually the computing grid sizes | |
| size_t global_stress_particle[3]; | |
| if (ManualGroupSize_pr[0] != -1) | |
| { | |
| global_stress_particle[0]=(size_t)ManualGroupSize_pr[0]; | |
| global_stress_particle[1]=(size_t)ManualGroupSize_pr[1]; | |
| global_stress_particle[2]=(size_t)ManualGroupSize_pr[2]; | |
| } | |
| else | |
| { | |
| global_stress_particle[0]=(size_t)INHOST(N1); | |
| global_stress_particle[1]=(size_t)INHOST(N2); | |
| global_stress_particle[2]=(size_t)INHOST(N3); | |
| } | |
| size_t * local_stress = NULL; | |
| size_t local_stress_manual[3]; | |
| if (ManualLocalSize_pr[0] != -1) | |
| { | |
| local_stress_manual[0]=(size_t)ManualLocalSize_pr[0]; | |
| local_stress_manual[1]=(size_t)ManualLocalSize_pr[1]; | |
| local_stress_manual[2]=(size_t)ManualLocalSize_pr[2]; | |
| local_stress=local_stress_manual; | |
| } | |
| PRINTF("global_stress_particle %i %i %i\n", | |
| global_stress_particle[0],global_stress_particle[1],global_stress_particle[2]); | |
| if (local_stress!=NULL) | |
| { | |
| PRINTF("local_stress %i %i %i\n", | |
| local_stress[0],local_stress[1],local_stress[2]); | |
| } | |
| else{ | |
| PRINTF("local_stress is NULL\n"); | |
| } | |
| const size_t global_sensors[1] ={INHOST(NumberSensors)}; | |
| if (NumberSnapshots>0) | |
| { | |
| mxcheckGPUErrors(clSetKernelArg(SnapShot, 1, sizeof(cl_mem), &gpu_Snapshots_pr)); | |
| mxcheckGPUErrors(clSetKernelArg(SnapShot, 2, sizeof(cl_mem), &gpu_Sigma_xx_pr)); | |
| mxcheckGPUErrors(clSetKernelArg(SnapShot, 3, sizeof(cl_mem), &gpu_Sigma_yy_pr)); | |
| mxcheckGPUErrors(clSetKernelArg(SnapShot, 4, sizeof(cl_mem), &gpu_Sigma_zz_pr)); | |
| } | |
| mxcheckGPUErrors(clSetKernelArg(SensorsKernel, 54, sizeof(cl_mem), &gpu_SensorOutput_pr)); | |
| mxcheckGPUErrors(clSetKernelArg(SensorsKernel, 55, sizeof(cl_mem), &gpu_IndexSensorMap_pr)); | |
| #endif | |
| #ifdef METAL | |
| unsigned int PML_1_local_stress[3]; | |
| unsigned int PML_1_global_stress[3]; | |
| unsigned int PML_2_local_stress[3]; | |
| unsigned int PML_2_global_stress[3]; | |
| unsigned int PML_3_local_stress[3]; | |
| unsigned int PML_3_global_stress[3]; | |
| unsigned int PML_4_local_stress[3]; | |
| unsigned int PML_4_global_stress[3]; | |
| unsigned int PML_5_local_stress[3]; | |
| unsigned int PML_5_global_stress[3]; | |
| unsigned int PML_6_local_stress[3]; | |
| unsigned int PML_6_global_stress[3]; | |
| unsigned int MAIN_1_local_stress[3]; | |
| unsigned int MAIN_1_global_stress[3]; | |
| unsigned int PML_1_local_particle[3]; | |
| unsigned int PML_1_global_particle[3]; | |
| unsigned int PML_2_local_particle[3]; | |
| unsigned int PML_2_global_particle[3]; | |
| unsigned int PML_3_local_particle[3]; | |
| unsigned int PML_3_global_particle[3]; | |
| unsigned int PML_4_local_particle[3]; | |
| unsigned int PML_4_global_particle[3]; | |
| unsigned int PML_5_local_particle[3]; | |
| unsigned int PML_5_global_particle[3]; | |
| unsigned int PML_6_local_particle[3]; | |
| unsigned int PML_6_global_particle[3]; | |
| unsigned int MAIN_1_local_particle[3]; | |
| unsigned int MAIN_1_global_particle[3]; | |
| if (ManualLocalSize_pr[0] != -1) | |
| { | |
| SET_USER_LOCAL_STRESS(MAIN_1) | |
| SET_USER_LOCAL_PARTICLE(MAIN_1) | |
| } | |
| else | |
| { | |
| CALC_USER_LOCAL_STRESS(MAIN_1) | |
| CALC_USER_LOCAL_PARTICLE(MAIN_1) | |
| } | |
| CALC_USER_LOCAL_STRESS(PML_1) | |
| CALC_USER_LOCAL_STRESS(PML_2) | |
| CALC_USER_LOCAL_STRESS(PML_3) | |
| CALC_USER_LOCAL_STRESS(PML_4) | |
| CALC_USER_LOCAL_STRESS(PML_5) | |
| CALC_USER_LOCAL_STRESS(PML_6) | |
| CALC_USER_LOCAL_PARTICLE(PML_1) | |
| CALC_USER_LOCAL_PARTICLE(PML_2) | |
| CALC_USER_LOCAL_PARTICLE(PML_3) | |
| CALC_USER_LOCAL_PARTICLE(PML_4) | |
| CALC_USER_LOCAL_PARTICLE(PML_5) | |
| CALC_USER_LOCAL_PARTICLE(PML_6) | |
| unsigned int local_sensors[3]; | |
| local_sensors[0]=computePipelineStateSensors.GetMaxTotalThreadsPerThreadgroup(); | |
| local_sensors[1]=1; | |
| local_sensors[2]=1; | |
| if (ManualGroupSize_pr[0] != -1) | |
| { | |
| SET_USER_GROUP_STRESS(MAIN_1) | |
| SET_USER_GROUP_PARTICLE(MAIN_1) | |
| } | |
| else | |
| { | |
| CALC_USER_GROUP_STRESS_MAIN(MAIN_1) | |
| CALC_USER_GROUP_PARTICLE_MAIN(MAIN_1) | |
| } | |
| CALC_USER_GROUP_PML(stress); | |
| CALC_USER_GROUP_PML(particle); | |
| unsigned int global_sensors[3]; | |
| global_sensors[0]=(unsigned int)ceil((float)(INHOST(NumberSensors)) / (float)local_sensors[0]); | |
| global_sensors[1]=1; | |
| global_sensors[2]=1; | |
| // PRINTF("global_stress %i %i %i, local_stress %i %i %i\n", | |
| // global_stress[0],global_stress[1],global_stress[2], | |
| // local_stress[0],local_stress[1],local_stress[2]); | |
| // PRINTF("global_particle %i %i %i, local_particle %i %i %i\n", | |
| // global_particle[0],global_particle[1],global_particle[2], | |
| // local_particle[0],local_particle[1],local_particle[2]); | |
| // PRINTF("global_sensors %i %i %i, local_sensors %i %i %i\n", | |
| // global_sensors[0],global_sensors[1],global_sensors[2], | |
| // local_sensors[0],local_sensors[1],local_sensors[2]); | |
| #endif | |
| LOCAL_CALLOC(Vx,GET_NUMBER_ELEMS(Vx_res)); | |
| LOCAL_CALLOC(Vy,GET_NUMBER_ELEMS(Vy_res)); | |
| LOCAL_CALLOC(Vz,GET_NUMBER_ELEMS(Vz_res)); | |
| LOCAL_CALLOC(Sigma_xx,GET_NUMBER_ELEMS(Sigma_xx_res)); | |
| LOCAL_CALLOC(Sigma_yy,GET_NUMBER_ELEMS(Sigma_yy_res)); | |
| LOCAL_CALLOC(Sigma_zz,GET_NUMBER_ELEMS(Sigma_zz_res)); | |
| LOCAL_CALLOC(Sigma_xy,GET_NUMBER_ELEMS(Sigma_xy_res)); | |
| LOCAL_CALLOC(Sigma_xz,GET_NUMBER_ELEMS(Sigma_xz_res)); | |
| LOCAL_CALLOC(Sigma_yz,GET_NUMBER_ELEMS(Sigma_yz_res)); | |
| LOCAL_CALLOC(Pressure,GET_NUMBER_ELEMS(Pressure_res)); | |
| unsigned int INHOST(nStep)=0; | |
| unsigned int SensorEntry=0; | |
| //%%%%%%%%%%%%% MAIN TEMPORAL LOOP %%%%%%%%%%%%%%%%%%%%%%%%%%%% | |
| //%%%%%%%%%%%%% MAIN TEMPORAL LOOP %%%%%%%%%%%%%%%%%%%%%%%%%%%% | |
| //%%%%%%%%%%%%% MAIN TEMPORAL LOOP %%%%%%%%%%%%%%%%%%%%%%%%%%%% | |
| while(INHOST(nStep)<INHOST(TimeSteps)) | |
| { | |
| // PRINTF("nStep %i of %i\n",INHOST(nStep),INHOST(TimeSteps)); | |
| #define CUDA_CALL(__KERNEL__,_IDSTREAM)\ | |
| __KERNEL__ <<< dimGrid## __KERNEL__,dimBlock## __KERNEL__,0,streams[_IDSTREAM] >>> (pGPU,INHOST(nStep),INHOST(TypeSource)); | |
| #if defined(CUDA) | |
| CUDA_CALL(MAIN_1_StressKernel,0); | |
| CUDA_CALL(PML_1_StressKernel,1); | |
| CUDA_CALL(PML_2_StressKernel,2); | |
| CUDA_CALL(PML_3_StressKernel,3); | |
| CUDA_CALL(PML_4_StressKernel,4); | |
| CUDA_CALL(PML_5_StressKernel,5); | |
| CUDA_CALL(PML_6_StressKernel,6); | |
| for(unsigned int nSyncStream=0;nSyncStream<TOTAL_streams;nSyncStream++) | |
| mxcheckGPUErrors(cudaStreamSynchronize(streams[nSyncStream])); | |
| CUDA_CALL(MAIN_1_ParticleKernel,0); | |
| CUDA_CALL(PML_1_ParticleKernel,1); | |
| CUDA_CALL(PML_2_ParticleKernel,2); | |
| CUDA_CALL(PML_3_ParticleKernel,3); | |
| CUDA_CALL(PML_4_ParticleKernel,4); | |
| CUDA_CALL(PML_5_ParticleKernel,5); | |
| CUDA_CALL(PML_6_ParticleKernel,6); | |
| for(unsigned int nSyncStream=0;nSyncStream<TOTAL_streams;nSyncStream++) | |
| mxcheckGPUErrors(cudaStreamSynchronize(streams[nSyncStream])); | |
| cudaDeviceSynchronize(); | |
| #endif | |
| #ifdef OPENCL | |
| int nextSnap=-1; | |
| if (NumberSnapshots>0) | |
| nextSnap=SnapshotsPos_pr[INHOST(CurrSnap)]-1; | |
| mxcheckGPUErrors(clSetKernelArg(StressKernel, 54, sizeof(unsigned int), &INHOST(nStep))); | |
| mxcheckGPUErrors(clSetKernelArg(StressKernel, 55, sizeof(unsigned int), &INHOST(TypeSource))); | |
| mxcheckGPUErrors(clSetKernelArg(ParticleKernel, 54, sizeof(unsigned int), &INHOST(nStep))); | |
| mxcheckGPUErrors(clSetKernelArg(ParticleKernel, 55, sizeof(unsigned int), &INHOST(TypeSource))); | |
| mxcheckGPUErrors(clEnqueueNDRangeKernel(commands, StressKernel, 3, NULL, global_stress_particle, local_stress, 0, NULL, NULL)); | |
| mxcheckGPUErrors(clFinish(commands)); | |
| mxcheckGPUErrors(clEnqueueNDRangeKernel(commands, ParticleKernel, 3, NULL, global_stress_particle, local_stress, 0, NULL, NULL)); | |
| mxcheckGPUErrors(clFinish(commands)); | |
| #endif | |
| #ifdef METAL | |
| InitSymbol(nStep,unsigned int,G_INT); | |
| InitSymbol(TypeSource,unsigned int,G_INT); | |
| InitSymbol(SelK,unsigned int,G_INT); | |
| mtlpp::CommandBuffer StresscommandBuffer = commandQueue.CommandBuffer(); | |
| mxcheckGPUErrors(((int)StresscommandBuffer)); | |
| ENCODE_STRESS(PML_1) | |
| ENCODE_STRESS(PML_2) | |
| ENCODE_STRESS(PML_3) | |
| ENCODE_STRESS(PML_4) | |
| ENCODE_STRESS(PML_5) | |
| ENCODE_STRESS(PML_6) | |
| ENCODE_STRESS(MAIN_1) | |
| ENCODE_PARTICLE(PML_1) | |
| ENCODE_PARTICLE(PML_2) | |
| ENCODE_PARTICLE(PML_3) | |
| ENCODE_PARTICLE(PML_4) | |
| ENCODE_PARTICLE(PML_5) | |
| ENCODE_PARTICLE(PML_6) | |
| ENCODE_PARTICLE(MAIN_1) | |
| StresscommandBuffer.Commit(); | |
| StresscommandBuffer.WaitUntilCompleted(); | |
| #endif | |
| // Snapshots | |
| if (INHOST(CurrSnap) <NumberSnapshots) | |
| if(INHOST(nStep)==SnapshotsPos_pr[INHOST(CurrSnap)]-1) | |
| { | |
| #if defined(CUDA) | |
| SnapShot<<<dimGridSnap,dimBlockSnap,0,streams[0]>>>(INHOST(SelK),gpu_Snapshots_pr,gpu_Sigma_xx_pr,gpu_Sigma_yy_pr,gpu_Sigma_zz_pr,INHOST(CurrSnap)); | |
| mxcheckGPUErrors(cudaStreamSynchronize(streams[0])); | |
| #endif | |
| #if defined(OPENCL) | |
| int selfSlice=INHOST(SelK); | |
| mxcheckGPUErrors(clSetKernelArg(SnapShot, 0, sizeof(unsigned int), &selfSlice)); | |
| mxcheckGPUErrors(clSetKernelArg(SnapShot, 5, sizeof(unsigned int), &INHOST(CurrSnap))); | |
| mxcheckGPUErrors(clEnqueueNDRangeKernel(commands, SnapShot, 2, NULL, global_stress_particle, NULL, 0, NULL, NULL)); | |
| mxcheckGPUErrors(clFinish(commands)); | |
| #endif | |
| #if defined(METAL) | |
| InitSymbol(CurrSnap,unsigned int,G_INT); | |
| mtlpp::CommandBuffer StresscommandBuffer = commandQueue.CommandBuffer(); | |
| mtlpp::ComputeCommandEncoder commandEncoderSnapShot = StresscommandBuffer.ComputeCommandEncoder(); | |
| commandEncoderSnapShot.SetBuffer(_CONSTANT_BUFFER_UINT, 0, 0); | |
| commandEncoderSnapShot.SetBuffer(_CONSTANT_BUFFER_MEX, 0, 1); | |
| commandEncoderSnapShot.SetBuffer(_INDEX_MEX, 0, 2); | |
| commandEncoderSnapShot.SetBuffer(_INDEX_UINT, 0, 3); | |
| commandEncoderSnapShot.SetBuffer(_UINT_BUFFER, 0, 4); | |
| for (_PT ii=0;ii<12;ii++) | |
| commandEncoderSnapShot.SetBuffer(_MEX_BUFFER[ii], 0, 5+ii); | |
| commandEncoderSnapShot.SetBuffer(gpu_Snapshots_pr, 0, 17); | |
| commandEncoderSnapShot.SetComputePipelineState(computePipelineStateSnapShot); | |
| commandEncoderSnapShot.DispatchThreadgroups( | |
| mtlpp::Size( | |
| (unsigned int)ceil((float)(INHOST(N1)+1) / 8), | |
| (unsigned int)ceil((float)(INHOST(N2)+1) / 8), | |
| 1), | |
| mtlpp::Size(8, 8,1)); | |
| commandEncoderSnapShot.EndEncoding(); | |
| #endif | |
| INHOST(CurrSnap)++; | |
| } | |
| //~ //Finally, the sensors | |
| if (((((_PT)INHOST(nStep)) % ((_PT)INHOST(SensorSubSampling)))==0) && | |
| ((((_PT)INHOST(nStep)) / ((_PT)INHOST(SensorSubSampling)))>=((_PT)INHOST(SensorStart))) && | |
| (SensorEntry < MaxSensorSteps)) | |
| { | |
| SensorEntry++; | |
| #if defined(CUDA) | |
| SensorsKernel<<<dimGridSensors,dimBlockSensors,0,streams[0]>>>(pGPU,gpu_IndexSensorMap_pr,INHOST(nStep)); | |
| mxcheckGPUErrors(cudaStreamSynchronize(streams[0])); | |
| #endif | |
| #if defined(OPENCL) | |
| mxcheckGPUErrors(clSetKernelArg(SensorsKernel, 56, sizeof(unsigned int), &INHOST(nStep))); | |
| mxcheckGPUErrors(clEnqueueNDRangeKernel(commands, SensorsKernel, 1, NULL, global_sensors, NULL, 0, NULL, NULL)); | |
| mxcheckGPUErrors(clFinish(commands)); | |
| #endif | |
| #if defined(METAL) | |
| mtlpp::CommandBuffer commandBufferSensors = commandQueue.CommandBuffer(); | |
| mxcheckGPUErrors(((int)commandBufferSensors)); | |
| mtlpp::ComputeCommandEncoder commandEncoderSensors = commandBufferSensors.ComputeCommandEncoder(); | |
| commandEncoderSensors.SetBuffer(_CONSTANT_BUFFER_UINT, 0, 0); | |
| commandEncoderSensors.SetBuffer(_CONSTANT_BUFFER_MEX, 0, 1); | |
| commandEncoderSensors.SetBuffer(_INDEX_MEX, 0, 2); | |
| commandEncoderSensors.SetBuffer(_INDEX_UINT, 0, 3); | |
| commandEncoderSensors.SetBuffer(_UINT_BUFFER, 0, 4); | |
| for (_PT ii=0;ii<12;ii++) | |
| commandEncoderSensors.SetBuffer(_MEX_BUFFER[ii], 0, 5+ii); | |
| commandEncoderSensors.SetComputePipelineState(computePipelineStateSensors); | |
| commandEncoderSensors.DispatchThreadgroups( | |
| mtlpp::Size( | |
| global_sensors[0], | |
| global_sensors[1], | |
| global_sensors[2]), | |
| mtlpp::Size(local_sensors[0], | |
| local_sensors[1], | |
| local_sensors[2])); | |
| commandEncoderSensors.EndEncoding(); | |
| commandBufferSensors.Commit(); | |
| commandBufferSensors.WaitUntilCompleted(); | |
| #endif | |
| } | |
| INHOST(nStep)++; | |
| } | |
| #if defined(METAL) | |
| //#we just synchronize before transferring data back to CPU | |
| mtlpp::CommandBuffer commandBufferSync = commandQueue.CommandBuffer(); | |
| mxcheckGPUErrors(((int)commandBufferSync)); | |
| mtlpp::BlitCommandEncoder blitCommandEncoderSync = commandBufferSync.BlitCommandEncoder(); | |
| for (_PT ii=0;ii<12;ii++) | |
| blitCommandEncoderSync.Synchronize(_MEX_BUFFER[ii]); | |
| blitCommandEncoderSync.EndEncoding(); | |
| commandBufferSync.Commit(); | |
| commandBufferSync.WaitUntilCompleted(); | |
| #endif | |
| //DONE, just to copy to the host the results | |
| #if defined(CUDA) || defined(OPENCL) | |
| CopyFromGPUToMX3(SensorOutput,mexType); | |
| CopyFromGPUToMX3(SqrAcc,mexType); | |
| #else | |
| CopyFromGPUToMX4(SensorOutput,mexType); | |
| CopyFromGPUToMX4(SqrAcc,mexType); | |
| #endif | |
| CopyFromGPUToMX(Vx,mexType); | |
| CopyFromGPUToMX(Vy,mexType); | |
| CopyFromGPUToMX(Vz,mexType); | |
| CopyFromGPUToMX(Sigma_xx,mexType); | |
| CopyFromGPUToMX(Sigma_yy,mexType); | |
| CopyFromGPUToMX(Sigma_xy,mexType); | |
| CopyFromGPUToMX(Sigma_xz,mexType); | |
| CopyFromGPUToMX(Sigma_yz,mexType); | |
| CopyFromGPUToMX(Pressure,mexType); | |
| { | |
| _PT i,j,k,CurZone; | |
| #pragma omp parallel for private(j,i,CurZone) | |
| for(k=0; k<((_PT)INHOST(N3)); k++) | |
| for(j=0; j<((_PT)INHOST(N2)); j++) | |
| for(i=0; i<((_PT)INHOST(N1)); i++) | |
| { | |
| ASSIGN_RES(Vx); | |
| ASSIGN_RES(Vy); | |
| ASSIGN_RES(Vz); | |
| ASSIGN_RES(Sigma_xx); | |
| ASSIGN_RES(Sigma_yy); | |
| ASSIGN_RES(Sigma_zz); | |
| ASSIGN_RES(Sigma_xy); | |
| ASSIGN_RES(Sigma_xz); | |
| ASSIGN_RES(Sigma_yz); | |
| ASSIGN_RES(Pressure); | |
| } | |
| } | |
| #if defined(CUDA) | |
| for (unsigned n =0;n<TOTAL_streams;n++) | |
| mxcheckGPUErrors(cudaStreamDestroy ( streams[n])) ; | |
| #endif | |
| CopyFromGPUToMX3(Snapshots,mexType); | |
| #if defined(CUDA) | |
| mxcheckGPUErrors(cudaFree(pGPU)); NumberAlloc--; | |
| free(sm13DeviceList); | |
| #endif | |
| free(Vx_pr); | |
| free(Vy_pr); | |
| free(Vz_pr); | |
| free(Sigma_xx_pr); | |
| free(Sigma_yy_pr); | |
| free(Sigma_zz_pr); | |
| free(Sigma_xy_pr); | |
| free(Sigma_xz_pr); | |
| free(Sigma_yz_pr); | |
| free(Pressure_pr); | |
| ownGPUFree(SensorOutput); | |
| ownGPUFree(V_x_x); | |
| ownGPUFree(V_y_x); | |
| ownGPUFree(V_z_x); | |
| ownGPUFree(V_x_y); | |
| ownGPUFree(V_y_y); | |
| ownGPUFree(V_z_y); | |
| ownGPUFree(V_x_z); | |
| ownGPUFree(V_y_z); | |
| ownGPUFree(V_z_z); | |
| ownGPUFree(Sigma_x_xx); | |
| ownGPUFree(Sigma_y_xx); | |
| ownGPUFree(Sigma_z_xx); | |
| ownGPUFree(Sigma_x_yy); | |
| ownGPUFree(Sigma_y_yy); | |
| ownGPUFree(Sigma_z_yy); | |
| ownGPUFree(Sigma_x_zz); | |
| ownGPUFree(Sigma_y_zz); | |
| ownGPUFree(Sigma_z_zz); | |
| ownGPUFree(Sigma_x_xy); | |
| ownGPUFree(Sigma_y_xy); | |
| ownGPUFree(Sigma_x_xz); | |
| ownGPUFree(Sigma_z_xz); | |
| ownGPUFree(Sigma_y_yz); | |
| ownGPUFree(Sigma_z_yz); | |
| ownGPUFree(LambdaMiuMatOverH); | |
| ownGPUFree(LambdaMatOverH); | |
| ownGPUFree(MiuMatOverH); | |
| ownGPUFree(TauLong); | |
| ownGPUFree(OneOverTauSigma); | |
| ownGPUFree(TauShear); | |
| ownGPUFree(InvRhoMatH); | |
| ownGPUFree(IndexSensorMap); | |
| ownGPUFree(SourceFunctions); | |
| ownGPUFree(SourceMap); | |
| ownGPUFree(Ox); | |
| ownGPUFree(Oy); | |
| ownGPUFree(Oz); | |
| ownGPUFree(MaterialMap); | |
| ownGPUFree(Vx); | |
| ownGPUFree(Vy); | |
| ownGPUFree(Vz); | |
| ownGPUFree(Sigma_xx); | |
| ownGPUFree(Sigma_yy); | |
| ownGPUFree(Sigma_zz); | |
| ownGPUFree(Sigma_xy); | |
| ownGPUFree(Sigma_xz); | |
| ownGPUFree(Sigma_yz); | |
| ownGPUFree(Pressure); | |
| ownGPUFree(Rxx); | |
| ownGPUFree(Ryy); | |
| ownGPUFree(Rzz); | |
| ownGPUFree(Rxy); | |
| ownGPUFree(Rxz); | |
| ownGPUFree(Ryz); | |
| ownGPUFree(Snapshots); | |
| ownGPUFree(SqrAcc); | |
| #if defined(CUDA) | |
| mxcheckGPUErrors(cudaMemGetInfo( &free_byte, &total_byte )); | |
| free_db = (double)free_byte ; | |
| total_db = (double)total_byte ; | |
| used_db = total_db - free_db ; | |
| PRINTF("GPU memory remaining (free should be equal to total): used = %f, free = %f MB, total = %f MB\n",used_db/1024.0/1024.0, free_db/1024.0/1024.0, total_db/1024.0/1024.0); | |
| #endif | |
| #if defined(OPENCL) | |
| clReleaseProgram(program); | |
| clReleaseKernel(StressKernel); | |
| clReleaseKernel(ParticleKernel); | |
| clReleaseKernel(SensorsKernel); | |
| clReleaseKernel(SnapShot); | |
| clReleaseCommandQueue(commands); | |
| clReleaseContext(context); | |
| free(Platform); | |
| #endif | |
| PRINTF("Number of unfreed allocs (it should be 0):%i\n",NumberAlloc); |