From 251ac8d6392d2f150c5d0ee2c399b8fc77dc44df Mon Sep 17 00:00:00 2001 From: Ingo Wald Date: Mon, 5 Jan 2015 13:29:10 -0600 Subject: [PATCH 1/3] fixes #6 --- ospray/api/LocalDevice.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/ospray/api/LocalDevice.cpp b/ospray/api/LocalDevice.cpp index 49f48e7442..017c884215 100644 --- a/ospray/api/LocalDevice.cpp +++ b/ospray/api/LocalDevice.cpp @@ -61,10 +61,11 @@ namespace ospray { embreeConfig << " threads=1,verbose=2"; rtcInit(embreeConfig.str().c_str()); - if (rtcGetError() != RTC_NO_ERROR) { + RTCError erc = rtcGetError(); + if (erc != RTC_NO_ERROR) { // why did the error function not get called !? - std::cerr << "#osp:init: embree internal error number " << (int)rtcGetError() << std::endl; - assert(rtcGetError() == RTC_NO_ERROR); + std::cerr << "#osp:init: embree internal error number " << (int)erc << std::endl; + assert(erc == RTC_NO_ERROR); } TiledLoadBalancer::instance = new LocalTiledLoadBalancer; } From 9908c0f24e72829b6327944f63c6061be702c278 Mon Sep 17 00:00:00 2001 From: Ingo Wald Date: Mon, 5 Jan 2015 13:40:00 -0600 Subject: [PATCH 2/3] removing explosion factor from instance (fixes #4, plus some doc updates to instance class --- ospray/geometry/Instance.cpp | 7 ------- ospray/geometry/Instance.h | 24 +++++++++++++++++------- 2 files changed, 17 insertions(+), 14 deletions(-) diff --git a/ospray/geometry/Instance.cpp b/ospray/geometry/Instance.cpp index 6d2aa281ad..48fc72fa19 100644 --- a/ospray/geometry/Instance.cpp +++ b/ospray/geometry/Instance.cpp @@ -41,13 +41,6 @@ namespace ospray { instancedScene->embreeSceneHandle); assert(instancedScene); - - const vec3f mesh_center = xfm.p == vec3f(0.f, 0.f, 0.f) - ? embree::center(instancedScene->geometry[0]->bounds) - : xfm.p; - const vec3f model_center = model->getParam3f("explosion.center", mesh_center); - const vec3f dir = mesh_center - model_center; - xfm.p += dir * model->getParam1f("explosion.factor", 0.f); rtcSetTransform(model->embreeSceneHandle,embreeGeomID, RTC_MATRIX_COLUMN_MAJOR, diff --git a/ospray/geometry/Instance.h b/ospray/geometry/Instance.h index bf046cdde2..fa7fb322e6 100644 --- a/ospray/geometry/Instance.h +++ b/ospray/geometry/Instance.h @@ -23,15 +23,18 @@ namespace ospray { /*! \defgroup geometry_instance Instancing ("instance") - \brief Implements instancing via a single instnace of another - model + \brief Implements instancing via a single instance of another + model. \ingroup ospray_supported_geometries Once created, a trianglemesh recognizes the following parameters
-    affine3f "xfm"   // transformation matrix the model is instantiated with
-    OSPModel "model" // model we're instancing
+    float3 "xfm.l.vx" // 1st column of the affine transformation matrix
+    float3 "xfm.l.vy" // 1st column of the affine transformation matrix
+    float3 "xfm.l.vz" // 1st column of the affine transformation matrix
+    float3 "xfm.p"    // 4th column (translation) of the affine transformation matrix
+    OSPModel "model"  // model we're instancing
     
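+    A minimal usage sketch (illustrative only; it assumes the generic
+    ospNewGeometry/ospSet3f/ospSetObject/ospCommit entry points, and the
+    names 'tx', 'ty', 'tz', and 'instancedModel' are hypothetical):
+
+      OSPGeometry inst = ospNewGeometry("instance");
+      ospSet3f(inst, "xfm.l.vx", 1.f, 0.f, 0.f);   // 1st column
+      ospSet3f(inst, "xfm.l.vy", 0.f, 1.f, 0.f);   // 2nd column
+      ospSet3f(inst, "xfm.l.vz", 0.f, 0.f, 1.f);   // 3rd column
+      ospSet3f(inst, "xfm.p",    tx,  ty,  tz);    // translation
+      ospSetObject(inst, "model", instancedModel); // OSPModel to instance
+      ospCommit(inst);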
The functionality for this geometry is implemented via the @@ -45,13 +48,20 @@ namespace ospray { */ struct Instance : public Geometry { + /*! Constructor */ Instance(); + //! \brief common function to help printf-debugging virtual std::string toString() const { return "ospray::Instance"; } + /*! \brief integrates this geometry's primitives into the respective + model's acceleration structure */ virtual void finalize(Model *model); - AffineSpace3f xfm; - Ref instancedScene; - uint32 embreeGeomID; + /*! transformation matrix associated with that instance's geometry. may be embree::one */ + AffineSpace3f xfm; + /*! reference to instanced model. Must be a *model* that we're instancing, not a geometry */ + Ref instancedScene; + /*! geometry ID of this geometry in the parent model */ + uint32 embreeGeomID; }; } // ::ospray From b46a837d2a0cbc8f5e2d6077a4fc6cc91800d0de Mon Sep 17 00:00:00 2001 From: Ingo Wald Date: Mon, 5 Jan 2015 14:03:50 -0600 Subject: [PATCH 3/3] several files had tabs; removed --- apps/common/widgets/glut3D.cpp | 10 +- apps/modelViewer/modelViewer.cpp | 56 +- apps/streamLineViewer/StreamLineViewer.cpp | 108 +-- ospray/api/API.cpp | 10 +- ospray/common/ISPC_KNC_Backend.h | 956 ++++++++++----------- ospray/common/Model.ispc | 2 +- ospray/fb/FrameBuffer.cpp | 4 +- ospray/mpi/MPILoadBalancer.cpp | 13 +- ospray/mpi/worker.cpp | 2 +- ospray/render/Renderer.ispc | 29 +- 10 files changed, 590 insertions(+), 600 deletions(-) diff --git a/apps/common/widgets/glut3D.cpp b/apps/common/widgets/glut3D.cpp index bce41e64b1..3b711bae4f 100644 --- a/apps/common/widgets/glut3D.cpp +++ b/apps/common/widgets/glut3D.cpp @@ -277,9 +277,9 @@ namespace ospray { if (!dumpFileRoot) dumpFileRoot = getenv("OSPRAY_SCREEN_DUMP_ROOT"); if (!dumpFileRoot) { - mkstemp(tmpFileName); + mkstemp(tmpFileName); dumpFileRoot = tmpFileName; - } + } char fileName[100000]; sprintf(fileName,"%s_%08ld.ppm",dumpFileRoot,times(NULL)); @@ -713,14 +713,14 @@ namespace ospray { if (animating) { dumpScreensDuringAnimation = !dumpScreensDuringAnimation; } else { - char tmpFileName[] = "/tmp/ospray_screen_dump_file.XXXXXXXX"; + char tmpFileName[] = "/tmp/ospray_screen_dump_file.XXXXXXXX"; static const char *dumpFileRoot; if (!dumpFileRoot) dumpFileRoot = getenv("OSPRAY_SCREEN_DUMP_ROOT"); if (!dumpFileRoot) { - mkstemp(tmpFileName); + mkstemp(tmpFileName); dumpFileRoot = tmpFileName; - } + } char fileName[100000]; static int frameDumpSequenceID = 0; sprintf(fileName,"%s_%05d.ppm",dumpFileRoot,frameDumpSequenceID++); diff --git a/apps/modelViewer/modelViewer.cpp b/apps/modelViewer/modelViewer.cpp index 51a848505e..0c0aed9c6f 100644 --- a/apps/modelViewer/modelViewer.cpp +++ b/apps/modelViewer/modelViewer.cpp @@ -213,23 +213,23 @@ namespace ospray { virtual void specialkey(int32 key, const vec2f where) { switch(key) { - case GLUT_KEY_PAGE_UP: - g_near_clip += 10.f * motionSpeed; - ospSet1f(renderer, "near_clip", g_near_clip); - ospCommit(renderer); - ospFrameBufferClear(fb,OSP_FB_ACCUM); - forceRedraw(); - break; - case GLUT_KEY_PAGE_DOWN: - g_near_clip -= 10.f * motionSpeed; - g_near_clip = std::max(g_near_clip, 1e-6f); - ospSet1f(renderer, "near_clip", g_near_clip); - ospCommit(renderer); - ospFrameBufferClear(fb,OSP_FB_ACCUM); - forceRedraw(); - break; - default: - Glut3DWidget::keypress(key,where); + case GLUT_KEY_PAGE_UP: + g_near_clip += 10.f * motionSpeed; + ospSet1f(renderer, "near_clip", g_near_clip); + ospCommit(renderer); + ospFrameBufferClear(fb,OSP_FB_ACCUM); + forceRedraw(); + break; + case GLUT_KEY_PAGE_DOWN: + 
g_near_clip -= 10.f * motionSpeed; + g_near_clip = std::max(g_near_clip, 1e-6f); + ospSet1f(renderer, "near_clip", g_near_clip); + ospCommit(renderer); + ospFrameBufferClear(fb,OSP_FB_ACCUM); + forceRedraw(); + break; + default: + Glut3DWidget::keypress(key,where); } } @@ -247,7 +247,7 @@ namespace ospray { viewPort.at = p; //viewPort.from += offset; viewPort.modified = true; - computeFrame(); + computeFrame(); accumID = 0; ospFrameBufferClear(fb,OSP_FB_ACCUM); //((glut3D::InspectCenter*)inspectCenterManipulator)->pivot = p; @@ -311,15 +311,15 @@ namespace ospray { ++accumID; if (showDepthBuffer) { - depthFB = (float *) ospMapFrameBuffer(fb, OSP_FB_DEPTH); - frameBufferMode = Glut3DWidget::FRAMEBUFFER_DEPTH; - Glut3DWidget::display(); - ospUnmapFrameBuffer(depthFB,fb); + depthFB = (float *) ospMapFrameBuffer(fb, OSP_FB_DEPTH); + frameBufferMode = Glut3DWidget::FRAMEBUFFER_DEPTH; + Glut3DWidget::display(); + ospUnmapFrameBuffer(depthFB,fb); } else { - ucharFB = (uint32 *) ospMapFrameBuffer(fb, OSP_FB_COLOR); - frameBufferMode = Glut3DWidget::FRAMEBUFFER_UCHAR; - Glut3DWidget::display(); - ospUnmapFrameBuffer(ucharFB,fb); + ucharFB = (uint32 *) ospMapFrameBuffer(fb, OSP_FB_COLOR); + frameBufferMode = Glut3DWidget::FRAMEBUFFER_UCHAR; + Glut3DWidget::display(); + ospUnmapFrameBuffer(ucharFB,fb); } // frameBufferMode = g_frameBufferMode; // switch(frameBufferMode) { @@ -694,7 +694,7 @@ namespace ospray { // add color array to mesh if (!msgMesh->color.empty()) { OSPData color = ospNewData(msgMesh->color.size(),OSP_FLOAT3A, - &msgMesh->color[0],OSP_DATA_SHARED_BUFFER); + &msgMesh->color[0],OSP_DATA_SHARED_BUFFER); assert(msgMesh->color.size() > 0); ospSetData(ospMesh,"vertex.color",color); } else { @@ -726,7 +726,7 @@ namespace ospray { materialList.push_back(createMaterial(ospRenderer, msgMesh->materialList[i].ptr)); for (miniSG::Material::ParamMap::const_iterator it = msgMesh->materialList[i]->params.begin(); - it != msgMesh->materialList[i]->params.end(); it++) { + it != msgMesh->materialList[i]->params.end(); it++) { const char *name = it->first.c_str(); const miniSG::Material::Param *p = it->second.ptr; if(p->type == miniSG::Material::Param::TEXTURE) { diff --git a/apps/streamLineViewer/StreamLineViewer.cpp b/apps/streamLineViewer/StreamLineViewer.cpp index 35085981c4..a18f36b76a 100644 --- a/apps/streamLineViewer/StreamLineViewer.cpp +++ b/apps/streamLineViewer/StreamLineViewer.cpp @@ -281,66 +281,66 @@ namespace ospray { } } - void exportOSX(const char *fn,StreamLines *streamLines, Triangles *triangles) + void exportOSX(const char *fn,StreamLines *streamLines, Triangles *triangles) + { + FILE *file = fopen(fn,"w"); + fprintf(file,"\n\n"); + fprintf(file,"\n"); { - FILE *file = fopen(fn,"w"); - fprintf(file,"\n\n"); - fprintf(file,"\n"); + fprintf(file,"\n"); { - fprintf(file,"\n"); + fprintf(file,"\n"); { - fprintf(file,"\n"); - { - fprintf(file,"\n"); - for (int i=0;ivertex.size();i++) - fprintf(file,"%f %f %f\n", - streamLines->vertex[i].x, - streamLines->vertex[i].y, - streamLines->vertex[i].z); - fprintf(file,"\n"); - - fprintf(file,"\n"); - for (int i=0;iindex.size();i++) - fprintf(file,"%i ",streamLines->index[i]); - fprintf(file,"\n\n"); - } - fprintf(file,"\n"); - - - fprintf(file,"\n"); - { - fprintf(file,"\n"); - for (int i=0;ivertex.size();i++) - fprintf(file,"%f %f %f\n", - triangles->vertex[i].x, - triangles->vertex[i].y, - triangles->vertex[i].z); - fprintf(file,"\n"); - - fprintf(file,"\n"); - for (int i=0;icolor.size();i++) - fprintf(file,"%f %f %f\n", - 
triangles->color[i].x, - triangles->color[i].y, - triangles->color[i].z); - fprintf(file,"\n"); - - fprintf(file,"\n"); - for (int i=0;iindex.size();i++) - fprintf(file,"%i %i %i\n", - triangles->index[i].x, - triangles->index[i].y, - triangles->index[i].z); - fprintf(file,"\n"); + fprintf(file,"\n"); + for (int i=0;ivertex.size();i++) + fprintf(file,"%f %f %f\n", + streamLines->vertex[i].x, + streamLines->vertex[i].y, + streamLines->vertex[i].z); + fprintf(file,"\n"); + + fprintf(file,"\n"); + for (int i=0;iindex.size();i++) + fprintf(file,"%i ",streamLines->index[i]); + fprintf(file,"\n\n"); + } + fprintf(file,"\n"); + + + fprintf(file,"\n"); + { + fprintf(file,"\n"); + for (int i=0;ivertex.size();i++) + fprintf(file,"%f %f %f\n", + triangles->vertex[i].x, + triangles->vertex[i].y, + triangles->vertex[i].z); + fprintf(file,"\n"); + + fprintf(file,"\n"); + for (int i=0;icolor.size();i++) + fprintf(file,"%f %f %f\n", + triangles->color[i].x, + triangles->color[i].y, + triangles->color[i].z); + fprintf(file,"\n"); + + fprintf(file,"\n"); + for (int i=0;iindex.size();i++) + fprintf(file,"%i %i %i\n", + triangles->index[i].x, + triangles->index[i].y, + triangles->index[i].z); + fprintf(file,"\n"); - } - fprintf(file,"\n"); } - fprintf(file,"\n"); + fprintf(file,"\n"); } - fprintf(file,"\n"); - fclose(file); + fprintf(file,"\n"); } + fprintf(file,"\n"); + fclose(file); + } struct StreamLineViewer : public Glut3DWidget { /*! construct volume from file name and dimensions \see volview_notes_on_volume_interface */ @@ -501,7 +501,7 @@ namespace ospray { else throw std::runtime_error("unknown file format "+fn.str()); } else if (arg == "--module") { - ospLoadModule(av[++i]); + ospLoadModule(av[++i]); } else if (arg == "--renderer") { rendererType = av[++i]; } else if (arg == "--radius") { diff --git a/ospray/api/API.cpp b/ospray/api/API.cpp index 34b9046666..fb0373b2d5 100644 --- a/ospray/api/API.cpp +++ b/ospray/api/API.cpp @@ -77,7 +77,7 @@ namespace ospray { #if OSPRAY_MPI std::cout << "#osp: launching ospray mpi ring - make sure that mpd is running" << std::endl; ospray::api::Device::current - = mpi::createMPI_LaunchWorkerGroup(_ac,_av,OSP_MPI_LAUNCH_FROM_ENV); + = mpi::createMPI_LaunchWorkerGroup(_ac,_av,OSP_MPI_LAUNCH_FROM_ENV); #else throw std::runtime_error("OSPRay MPI support not compiled in"); #endif @@ -336,10 +336,10 @@ namespace ospray { } extern "C" OSPTexture2D ospNewTexture2D(int width, - int height, - OSPDataType type, - void *data = NULL, - int flags = 0) + int height, + OSPDataType type, + void *data = NULL, + int flags = 0) { ASSERT_DEVICE(); Assert2(width > 0, "Width must be greater than 0 in ospNewTexture2D"); diff --git a/ospray/common/ISPC_KNC_Backend.h b/ospray/common/ISPC_KNC_Backend.h index b2937ab5e6..69bc24b554 100644 --- a/ospray/common/ISPC_KNC_Backend.h +++ b/ospray/common/ISPC_KNC_Backend.h @@ -15,23 +15,23 @@ // ======================================================================== // /** - Copyright (c) 2010-2014, Intel Corporation - All rights reserved. + Copyright (c) 2010-2014, Intel Corporation + All rights reserved. - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. 
+ * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. - * Neither the name of Intel Corporation nor the names of its - contributors may be used to endorse or promote products derived from - this software without specific prior written permission. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS @@ -70,13 +70,13 @@ #if 0 - #define STRING(x) #x - #define TOSTRING(x) STRING(x) - #define PING std::cout << __FILE__ << " (" << __LINE__ << "): " << __FUNCTION__ << std::endl - #define PRINT(x) std::cout << STRING(x) << " = " << (x) << std::endl - #define PRINT2(x,y) std::cout << STRING(x) << " = " << (x) << ", " << STRING(y) << " = " << (y) << std::endl - #define PRINT3(x,y,z) std::cout << STRING(x) << " = " << (x) << ", " << STRING(y) << " = " << (y) << ", " << STRING(z) << " = " << (z) << std::endl - #define PRINT4(x,y,z,w) std::cout << STRING(x) << " = " << (x) << ", " << STRING(y) << " = " << (y) << ", " << STRING(z) << " = " << (z) << ", " << STRING(w) << " = " << (w) << std::endl +#define STRING(x) #x +#define TOSTRING(x) STRING(x) +#define PING std::cout << __FILE__ << " (" << __LINE__ << "): " << __FUNCTION__ << std::endl +#define PRINT(x) std::cout << STRING(x) << " = " << (x) << std::endl +#define PRINT2(x,y) std::cout << STRING(x) << " = " << (x) << ", " << STRING(y) << " = " << (y) << std::endl +#define PRINT3(x,y,z) std::cout << STRING(x) << " = " << (x) << ", " << STRING(y) << " = " << (y) << ", " << STRING(z) << " = " << (z) << std::endl +#define PRINT4(x,y,z,w) std::cout << STRING(x) << " = " << (x) << ", " << STRING(y) << " = " << (y) << ", " << STRING(z) << " = " << (z) << ", " << STRING(w) << " = " << (w) << std::endl #endif #define FORCEINLINE __forceinline @@ -112,10 +112,10 @@ struct __vec16_i32; #if 1 /* (iw) actually, this *SHOULD* be the right implementation for a -vec16_i1: this one is a class that can have a constructor (which -ISPC sometimes emits for these vectors...) This version might -not be working with embree's ISPC bindings, probably because -embree still uses the 'wrong' implementation */ + vec16_i1: this one is a class that can have a constructor (which + ISPC sometimes emits for these vectors...) 
This version might + not be working with embree's ISPC bindings, probably because + embree still uses the 'wrong' implementation */ typedef struct PRE_ALIGN(2) __vec16_i1 { FORCEINLINE operator __mmask16() const { return v; } @@ -126,21 +126,21 @@ typedef struct PRE_ALIGN(2) __vec16_i1 bool v8, bool v9, bool v10, bool v11, bool v12, bool v13, bool v14, bool v15) { v = ((v0 & 1) | - ((v1 & 1) << 1) | - ((v2 & 1) << 2) | - ((v3 & 1) << 3) | - ((v4 & 1) << 4) | - ((v5 & 1) << 5) | - ((v6 & 1) << 6) | - ((v7 & 1) << 7) | - ((v8 & 1) << 8) | - ((v9 & 1) << 9) | - ((v10 & 1) << 10) | - ((v11 & 1) << 11) | - ((v12 & 1) << 12) | - ((v13 & 1) << 13) | - ((v14 & 1) << 14) | - ((v15 & 1) << 15)); + ((v1 & 1) << 1) | + ((v2 & 1) << 2) | + ((v3 & 1) << 3) | + ((v4 & 1) << 4) | + ((v5 & 1) << 5) | + ((v6 & 1) << 6) | + ((v7 & 1) << 7) | + ((v8 & 1) << 8) | + ((v9 & 1) << 9) | + ((v10 & 1) << 10) | + ((v11 & 1) << 11) | + ((v12 & 1) << 12) | + ((v13 & 1) << 13) | + ((v14 & 1) << 14) | + ((v15 & 1) << 15)); } FORCEINLINE uint8_t operator[](const int i) const { return ((v >> i) & 1); } FORCEINLINE uint8_t operator[](const int i) { return ((v >> i) & 1); } @@ -216,22 +216,22 @@ typedef struct PRE_ALIGN(64) __vec16_i64 { __m512i v1 = _mm512_set_8to8_epi64(v15, v14, v13, v12, v11, v10, v09, v08); __m512i v2 = _mm512_set_8to8_epi64(v07, v06, v05, v04, v03, v02, v01, v00); v_hi = _mm512_mask_permutevar_epi32(v_hi, 0xFF00, - _mm512_set_16to16_pi(15,13,11,9,7,5,3,1,14,12,10,8,6,4,2,0), - v1); + _mm512_set_16to16_pi(15,13,11,9,7,5,3,1,14,12,10,8,6,4,2,0), + v1); v_hi = _mm512_mask_permutevar_epi32(v_hi, 0x00FF, - _mm512_set_16to16_pi(14,12,10,8,6,4,2,0,15,13,11,9,7,5,3,1), - v2); + _mm512_set_16to16_pi(14,12,10,8,6,4,2,0,15,13,11,9,7,5,3,1), + v2); v_lo = _mm512_mask_permutevar_epi32(v_lo, 0xFF00, - _mm512_set_16to16_pi(14,12,10,8,6,4,2,0,15,13,11,9,7,5,3,1), - v1); + _mm512_set_16to16_pi(14,12,10,8,6,4,2,0,15,13,11,9,7,5,3,1), + v1); v_lo = _mm512_mask_permutevar_epi32(v_lo, 0x00FF, - _mm512_set_16to16_pi(15,13,11,9,7,5,3,1,14,12,10,8,6,4,2,0), - v2); + _mm512_set_16to16_pi(15,13,11,9,7,5,3,1,14,12,10,8,6,4,2,0), + v2); } FORCEINLINE int64_t operator[](const int i) const { - return ((uint64_t(((int32_t*)this)[i])<<32)+((int32_t*)this)[i+16]); } + return ((uint64_t(((int32_t*)this)[i])<<32)+((int32_t*)this)[i+16]); } FORCEINLINE int64_t operator[](const int i) { - return ((uint64_t(((int32_t*)this)[i])<<32)+((int32_t*)this)[i+16]); } + return ((uint64_t(((int32_t*)this)[i])<<32)+((int32_t*)this)[i+16]); } __m512i v_hi; __m512i v_lo; } POST_ALIGN(64) __vec16_i64; @@ -239,40 +239,40 @@ typedef struct PRE_ALIGN(64) __vec16_i64 { static __vec16_i64 zmm2hilo(const __m512i v1, const __m512i v2){ __vec16_i64 v; v.v_hi = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xFF00, - _mm512_set_16to16_pi(15,13,11,9,7,5,3,1,14,12,10,8,6,4,2,0), - v2); + _mm512_set_16to16_pi(15,13,11,9,7,5,3,1,14,12,10,8,6,4,2,0), + v2); v.v_hi = _mm512_mask_permutevar_epi32(v.v_hi, 0x00FF, - _mm512_set_16to16_pi(14,12,10,8,6,4,2,0,15,13,11,9,7,5,3,1), - v1); + _mm512_set_16to16_pi(14,12,10,8,6,4,2,0,15,13,11,9,7,5,3,1), + v1); v.v_lo = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xFF00, - _mm512_set_16to16_pi(14,12,10,8,6,4,2,0,15,13,11,9,7,5,3,1), - v2); + _mm512_set_16to16_pi(14,12,10,8,6,4,2,0,15,13,11,9,7,5,3,1), + v2); v.v_lo = _mm512_mask_permutevar_epi32(v.v_lo, 0x00FF, - _mm512_set_16to16_pi(15,13,11,9,7,5,3,1,14,12,10,8,6,4,2,0), - v1); + _mm512_set_16to16_pi(15,13,11,9,7,5,3,1,14,12,10,8,6,4,2,0), + v1); return v; } 
static void hilo2zmm(const __vec16_i64 &v, __m512i &_v1, __m512i &_v2) { _v2 = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xAAAA, - _mm512_set_16to16_pi(15,15,14,14,13,13,12,12,11,11,10,10,9,9,8,8), - v.v_hi); + _mm512_set_16to16_pi(15,15,14,14,13,13,12,12,11,11,10,10,9,9,8,8), + v.v_hi); _v2 = _mm512_mask_permutevar_epi32(_v2, 0x5555, - _mm512_set_16to16_pi(15,15,14,14,13,13,12,12,11,11,10,10,9,9,8,8), - v.v_lo); + _mm512_set_16to16_pi(15,15,14,14,13,13,12,12,11,11,10,10,9,9,8,8), + v.v_lo); _v1 = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xAAAA, - _mm512_set_16to16_pi(7,7,6,6,5,5,4,4,3,3,2,2,1,1,0,0), - v.v_hi); + _mm512_set_16to16_pi(7,7,6,6,5,5,4,4,3,3,2,2,1,1,0,0), + v.v_hi); _v1 = _mm512_mask_permutevar_epi32(_v1, 0x5555, - _mm512_set_16to16_pi(7,7,6,6,5,5,4,4,3,3,2,2,1,1,0,0), - v.v_lo); + _mm512_set_16to16_pi(7,7,6,6,5,5,4,4,3,3,2,2,1,1,0,0), + v.v_lo); } template struct vec16 { FORCEINLINE vec16() { } FORCEINLINE vec16(T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, - T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15) { + T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15) { v[0] = v0; v[1] = v1; v[2] = v2; v[3] = v3; v[4] = v4; v[5] = v5; v[6] = v6; v[7] = v7; v[8] = v8; v[9] = v9; v[10] = v10; v[11] = v11; @@ -460,7 +460,7 @@ static FORCEINLINE __vec16_i1 __or(__vec16_i1 a, __vec16_i1 b) { } static FORCEINLINE __vec16_i1 __select(__vec16_i1 mask, __vec16_i1 a, - __vec16_i1 b) { + __vec16_i1 b) { return ((a & mask) | (b & ~mask)); //return __or(__and(a, mask), __andnr(b, mask)); } @@ -489,14 +489,14 @@ static FORCEINLINE int64_t __extract_element(const __vec16_i64 &v, uint32_t inde /* - static FORCEINLINE void __insert_element(__vec16_i1 *vec, int index, - bool val) { - if (val == false) - vec->v &= ~(1 << index); - else - vec->v |= (1 << index); - } - */ + static FORCEINLINE void __insert_element(__vec16_i1 *vec, int index, + bool val) { + if (val == false) + vec->v &= ~(1 << index); + else + vec->v |= (1 << index); + } +*/ template static FORCEINLINE __vec16_i1 __load(const __vec16_i1 *p) { const uint16_t *ptr = (const uint16_t *)p; @@ -599,7 +599,7 @@ static FORCEINLINE __vec16_i1 __equal_i32(const __vec16_i32 &a, const __vec16_i3 } static FORCEINLINE __vec16_i1 __equal_i32_and_mask(const __vec16_i32 &a, const __vec16_i32 &b, - __vec16_i1 m) { + __vec16_i1 m) { return _mm512_mask_cmpeq_epi32_mask(m, a, b); } @@ -608,7 +608,7 @@ static FORCEINLINE __vec16_i1 __not_equal_i32(__vec16_i32 a, __vec16_i32 b) { } static FORCEINLINE __vec16_i1 __not_equal_i32_and_mask(__vec16_i32 a, __vec16_i32 b, - __vec16_i1 m) { + __vec16_i1 m) { return _mm512_mask_cmpneq_epi32_mask(m, a, b); } @@ -617,7 +617,7 @@ static FORCEINLINE __vec16_i1 __unsigned_less_equal_i32(__vec16_i32 a, __vec16_i } static FORCEINLINE __vec16_i1 __unsigned_less_equal_i32_and_mask(__vec16_i32 a, __vec16_i32 b, - __vec16_i1 m) { + __vec16_i1 m) { return _mm512_mask_cmple_epu32_mask(m, a, b); } @@ -626,7 +626,7 @@ static FORCEINLINE __vec16_i1 __signed_less_equal_i32(__vec16_i32 a, __vec16_i32 } static FORCEINLINE __vec16_i1 __signed_less_equal_i32_and_mask(__vec16_i32 a, __vec16_i32 b, - __vec16_i1 m) { + __vec16_i1 m) { return _mm512_mask_cmple_epi32_mask(m, a, b); } @@ -635,7 +635,7 @@ static FORCEINLINE __vec16_i1 __unsigned_greater_equal_i32(__vec16_i32 a, __vec1 } static FORCEINLINE __vec16_i1 __unsigned_greater_equal_i32_and_mask(__vec16_i32 a, __vec16_i32 b, - __vec16_i1 m) { + __vec16_i1 m) { return _mm512_mask_cmpge_epu32_mask(m, a, b); } @@ -644,7 +644,7 @@ static FORCEINLINE 
__vec16_i1 __signed_greater_equal_i32(__vec16_i32 a, __vec16_ } static FORCEINLINE __vec16_i1 __signed_greater_equal_i32_and_mask(__vec16_i32 a, __vec16_i32 b, - __vec16_i1 m) { + __vec16_i1 m) { return _mm512_mask_cmpge_epi32_mask(m, a, b); } @@ -653,7 +653,7 @@ static FORCEINLINE __vec16_i1 __unsigned_less_than_i32(__vec16_i32 a, __vec16_i3 } static FORCEINLINE __vec16_i1 __unsigned_less_than_i32_and_mask(__vec16_i32 a, __vec16_i32 b, - __vec16_i1 m) { + __vec16_i1 m) { return _mm512_mask_cmplt_epu32_mask(m, a, b); } @@ -662,7 +662,7 @@ static FORCEINLINE __vec16_i1 __signed_less_than_i32(__vec16_i32 a, __vec16_i32 } static FORCEINLINE __vec16_i1 __signed_less_than_i32_and_mask(__vec16_i32 a, __vec16_i32 b, - __vec16_i1 m) { + __vec16_i1 m) { return _mm512_mask_cmplt_epi32_mask(m, a, b); } @@ -671,7 +671,7 @@ static FORCEINLINE __vec16_i1 __unsigned_greater_than_i32(__vec16_i32 a, __vec16 } static FORCEINLINE __vec16_i1 __unsigned_greater_than_i32_and_mask(__vec16_i32 a, __vec16_i32 b, - __vec16_i1 m) { + __vec16_i1 m) { return _mm512_mask_cmpgt_epu32_mask(m, a, b); } @@ -680,12 +680,12 @@ static FORCEINLINE __vec16_i1 __signed_greater_than_i32(__vec16_i32 a, __vec16_i } static FORCEINLINE __vec16_i1 __signed_greater_than_i32_and_mask(__vec16_i32 a, __vec16_i32 b, - __vec16_i1 m) { + __vec16_i1 m) { return _mm512_mask_cmpgt_epi32_mask(m, a, b); } static FORCEINLINE __vec16_i32 __select(__vec16_i1 mask, - __vec16_i32 a, __vec16_i32 b) { + __vec16_i32 a, __vec16_i32 b) { return _mm512_mask_mov_epi32(b.v, mask, a.v); } @@ -771,12 +771,12 @@ static FORCEINLINE __vec16_i32 __shuffle_i32(__vec16_i32 v, __vec16_i32 index) { } static FORCEINLINE __vec16_i32 __shuffle2_i32(__vec16_i32 v0, __vec16_i32 v1, __vec16_i32 index) { - const __vec16_i1 mask = __signed_less_than_i32(index, __smear_i32<__vec16_i32>(0x10)); - index = __and(index, __smear_i32<__vec16_i32>(0xF)); - __vec16_i32 ret = __undef_i32<__vec16_i32>(); - ret = _mm512_mask_permutevar_epi32(ret, mask, index, v0); - ret = _mm512_mask_permutevar_epi32(ret, __not(mask), index, v1); - return ret; + const __vec16_i1 mask = __signed_less_than_i32(index, __smear_i32<__vec16_i32>(0x10)); + index = __and(index, __smear_i32<__vec16_i32>(0xF)); + __vec16_i32 ret = __undef_i32<__vec16_i32>(); + ret = _mm512_mask_permutevar_epi32(ret, mask, index, v0); + ret = _mm512_mask_permutevar_epi32(ret, __not(mask), index, v1); + return ret; } static FORCEINLINE __vec16_i32 __shift_i32(__vec16_i32 v, int index) { @@ -826,7 +826,7 @@ template <> FORCEINLINE void __store<64>(__vec16_i32 *p, __vec16_i32 v) { /////////////////////////////////////////////////////////////////////////// static FORCEINLINE __vec16_i64 __select(__vec16_i1 mask, - __vec16_i64 a, __vec16_i64 b) { + __vec16_i64 a, __vec16_i64 b) { __vec16_i64 ret; ret.v_hi = _mm512_mask_mov_epi32(b.v_hi, mask, a.v_hi); ret.v_lo = _mm512_mask_mov_epi32(b.v_lo, mask, a.v_lo); @@ -839,17 +839,17 @@ void __masked_store_i64(void *p, const __vec16_i64 &v, __vec16_i1 mask) __m512i v1; __m512i v2; v1 = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xAAAA, - _mm512_set_16to16_pi(15,15,14,14,13,13,12,12,11,11,10,10,9,9,8,8), - v.v_hi); + _mm512_set_16to16_pi(15,15,14,14,13,13,12,12,11,11,10,10,9,9,8,8), + v.v_hi); v1 = _mm512_mask_permutevar_epi32(v1, 0x5555, - _mm512_set_16to16_pi(15,15,14,14,13,13,12,12,11,11,10,10,9,9,8,8), - v.v_lo); + _mm512_set_16to16_pi(15,15,14,14,13,13,12,12,11,11,10,10,9,9,8,8), + v.v_lo); v2 = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xAAAA, - 
_mm512_set_16to16_pi(7,7,6,6,5,5,4,4,3,3,2,2,1,1,0,0), - v.v_hi); + _mm512_set_16to16_pi(7,7,6,6,5,5,4,4,3,3,2,2,1,1,0,0), + v.v_hi); v2 = _mm512_mask_permutevar_epi32(v2, 0x5555, - _mm512_set_16to16_pi(7,7,6,6,5,5,4,4,3,3,2,2,1,1,0,0), - v.v_lo); + _mm512_set_16to16_pi(7,7,6,6,5,5,4,4,3,3,2,2,1,1,0,0), + v.v_lo); _mm512_mask_store_epi64(p, mask, v2); _mm512_mask_store_epi64(((uint8_t*)p)+64, mask>>8, v1); } @@ -894,8 +894,8 @@ static FORCEINLINE __vec16_i64 __sub(const __vec16_i64 &a, const __vec16_i64 &b) static FORCEINLINE __vec16_i64 __mul(const __vec16_i32 &a, const __vec16_i64 &b) { return __vec16_i64(_mm512_mullo_epi32(a.v,b.v_lo), - _mm512_add_epi32(_mm512_mullo_epi32(a.v, b.v_hi), - _mm512_mulhi_epi32(a.v, b.v_lo))); + _mm512_add_epi32(_mm512_mullo_epi32(a.v, b.v_hi), + _mm512_mulhi_epi32(a.v, b.v_lo))); } static FORCEINLINE void __abs_i32i64(__m512i &_hi, __m512i &_lo) @@ -969,10 +969,10 @@ static FORCEINLINE __vec16_i64 __xor(__vec16_i64 a, __vec16_i64 b) { static FORCEINLINE __vec16_i64 __shl(__vec16_i64 a, __vec16_i64 b) { /* this is a safety gate in case b-shift >= 32 */ const __vec16_i32 xfer = __select( - __signed_less_than_i32(b.v_lo, __ispc_thirty_two), - __lshr(a.v_lo, __sub(__ispc_thirty_two, b.v_lo)), - __shl (a.v_lo, __sub(b.v_lo, __ispc_thirty_two)) - ); + __signed_less_than_i32(b.v_lo, __ispc_thirty_two), + __lshr(a.v_lo, __sub(__ispc_thirty_two, b.v_lo)), + __shl (a.v_lo, __sub(b.v_lo, __ispc_thirty_two)) + ); const __vec16_i32 hi = __or(__shl(a.v_hi, b.v_lo), xfer); const __vec16_i32 lo = __shl(a.v_lo, b.v_lo); return __vec16_i64(lo, hi); @@ -980,7 +980,7 @@ static FORCEINLINE __vec16_i64 __shl(__vec16_i64 a, __vec16_i64 b) { static FORCEINLINE __vec16_i64 __shl(__vec16_i64 a, unsigned long long b) { __vec16_i32 hi = _mm512_or_epi32(_mm512_slli_epi32(a.v_hi, b), - _mm512_srli_epi32(a.v_lo, 32-b)); + _mm512_srli_epi32(a.v_lo, 32-b)); __vec16_i32 lo = _mm512_slli_epi32(a.v_lo, b); return __vec16_i64(lo, hi); } @@ -988,10 +988,10 @@ static FORCEINLINE __vec16_i64 __shl(__vec16_i64 a, unsigned long long b) { static FORCEINLINE __vec16_i64 __lshr(__vec16_i64 a, __vec16_i64 b) { /* this is a safety gate in case b-shift >= 32 */ const __vec16_i32 xfer = __select( - __signed_less_than_i32(b.v_lo, __ispc_thirty_two), - __shl (a.v_hi, __sub(__ispc_thirty_two, b.v_lo)), - __lshr(a.v_hi, __sub(b.v_lo, __ispc_thirty_two)) - ); + __signed_less_than_i32(b.v_lo, __ispc_thirty_two), + __shl (a.v_hi, __sub(__ispc_thirty_two, b.v_lo)), + __lshr(a.v_hi, __sub(b.v_lo, __ispc_thirty_two)) + ); const __vec16_i32 lo = __or(__lshr(a.v_lo, b.v_lo), xfer); const __vec16_i32 hi = __lshr(a.v_hi, b.v_lo); @@ -1012,10 +1012,10 @@ static FORCEINLINE __vec16_i64 __lshr(__vec16_i64 a, unsigned long long b) { static FORCEINLINE __vec16_i64 __ashr(__vec16_i64 a, __vec16_i64 b) { /* this is a safety gate in case b-shift >= 32 */ const __vec16_i32 xfer = __select( - __signed_less_than_i32(b.v_lo, __ispc_thirty_two), - __shl (a.v_hi, __sub(__ispc_thirty_two, b.v_lo)), - __ashr(a.v_hi, __sub(b.v_lo, __ispc_thirty_two)) - ); + __signed_less_than_i32(b.v_lo, __ispc_thirty_two), + __shl (a.v_hi, __sub(__ispc_thirty_two, b.v_lo)), + __ashr(a.v_hi, __sub(b.v_lo, __ispc_thirty_two)) + ); const __vec16_i32 lo = __or(__lshr(a.v_lo, b.v_lo), xfer); const __vec16_i32 hi = __ashr(a.v_hi, b.v_lo); return __vec16_i64(lo, hi); @@ -1024,8 +1024,8 @@ static FORCEINLINE __vec16_i64 __ashr(__vec16_i64 a, __vec16_i64 b) { static FORCEINLINE __vec16_i64 __ashr(__vec16_i64 a, unsigned long long b) { __vec16_i32 xfer = 
_mm512_slli_epi32(_mm512_and_epi32(a.v_hi, - _mm512_set1_epi32((1< static FORCEINLINE __vec16_i64 __load(const __vec16_i64 *p) __vec16_i64 ret; ret.v_hi = _mm512_mask_permutevar_epi32(ret.v_hi, 0xFF00, - _mm512_set_16to16_pi(15,13,11,9,7,5,3,1,14,12,10,8,6,4,2,0), - v1); + _mm512_set_16to16_pi(15,13,11,9,7,5,3,1,14,12,10,8,6,4,2,0), + v1); ret.v_hi = _mm512_mask_permutevar_epi32(ret.v_hi, 0x00FF, - _mm512_set_16to16_pi(14,12,10,8,6,4,2,0,15,13,11,9,7,5,3,1), - v2); + _mm512_set_16to16_pi(14,12,10,8,6,4,2,0,15,13,11,9,7,5,3,1), + v2); ret.v_lo = _mm512_mask_permutevar_epi32(ret.v_lo, 0xFF00, - _mm512_set_16to16_pi(14,12,10,8,6,4,2,0,15,13,11,9,7,5,3,1), - v1); + _mm512_set_16to16_pi(14,12,10,8,6,4,2,0,15,13,11,9,7,5,3,1), + v1); ret.v_lo = _mm512_mask_permutevar_epi32(ret.v_lo, 0x00FF, - _mm512_set_16to16_pi(15,13,11,9,7,5,3,1,14,12,10,8,6,4,2,0), - v2); + _mm512_set_16to16_pi(15,13,11,9,7,5,3,1,14,12,10,8,6,4,2,0), + v2); return ret; } @@ -1195,17 +1195,17 @@ template <> FORCEINLINE __vec16_i64 __load<64>(const __vec16_i64 *p) { __m512i v1 = _mm512_load_epi32(((uint8_t*)p)+64); __vec16_i64 ret; ret.v_hi = _mm512_mask_permutevar_epi32(ret.v_hi, 0xFF00, - _mm512_set_16to16_pi(15,13,11,9,7,5,3,1,14,12,10,8,6,4,2,0), - v1); + _mm512_set_16to16_pi(15,13,11,9,7,5,3,1,14,12,10,8,6,4,2,0), + v1); ret.v_hi = _mm512_mask_permutevar_epi32(ret.v_hi, 0x00FF, - _mm512_set_16to16_pi(14,12,10,8,6,4,2,0,15,13,11,9,7,5,3,1), - v2); + _mm512_set_16to16_pi(14,12,10,8,6,4,2,0,15,13,11,9,7,5,3,1), + v2); ret.v_lo = _mm512_mask_permutevar_epi32(ret.v_lo, 0xFF00, - _mm512_set_16to16_pi(14,12,10,8,6,4,2,0,15,13,11,9,7,5,3,1), - v1); + _mm512_set_16to16_pi(14,12,10,8,6,4,2,0,15,13,11,9,7,5,3,1), + v1); ret.v_lo = _mm512_mask_permutevar_epi32(ret.v_lo, 0x00FF, - _mm512_set_16to16_pi(15,13,11,9,7,5,3,1,14,12,10,8,6,4,2,0), - v2); + _mm512_set_16to16_pi(15,13,11,9,7,5,3,1,14,12,10,8,6,4,2,0), + v2); return ret; } @@ -1218,17 +1218,17 @@ template static FORCEINLINE void __store(__vec16_i64 *p, __vec16_i64 __m512i v1; __m512i v2; v1 = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xAAAA, - _mm512_set_16to16_pi(15,15,14,14,13,13,12,12,11,11,10,10,9,9,8,8), - v.v_hi); + _mm512_set_16to16_pi(15,15,14,14,13,13,12,12,11,11,10,10,9,9,8,8), + v.v_hi); v1 = _mm512_mask_permutevar_epi32(v1, 0x5555, - _mm512_set_16to16_pi(15,15,14,14,13,13,12,12,11,11,10,10,9,9,8,8), - v.v_lo); + _mm512_set_16to16_pi(15,15,14,14,13,13,12,12,11,11,10,10,9,9,8,8), + v.v_lo); v2 = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xAAAA, - _mm512_set_16to16_pi(7,7,6,6,5,5,4,4,3,3,2,2,1,1,0,0), - v.v_hi); + _mm512_set_16to16_pi(7,7,6,6,5,5,4,4,3,3,2,2,1,1,0,0), + v.v_hi); v2 = _mm512_mask_permutevar_epi32(v2, 0x5555, - _mm512_set_16to16_pi(7,7,6,6,5,5,4,4,3,3,2,2,1,1,0,0), - v.v_lo); + _mm512_set_16to16_pi(7,7,6,6,5,5,4,4,3,3,2,2,1,1,0,0), + v.v_lo); _mm512_extpackstorelo_epi32(p, v2, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); _mm512_extpackstorehi_epi32((uint8_t*)p+64, v2, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); _mm512_extpackstorelo_epi32((uint8_t*)p+64, v1, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); @@ -1239,17 +1239,17 @@ template <> FORCEINLINE void __store<64>(__vec16_i64 *p, __vec16_i64 v) { __m512i v1; __m512i v2; v1 = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xAAAA, - _mm512_set_16to16_pi(15,15,14,14,13,13,12,12,11,11,10,10,9,9,8,8), - v.v_hi); + _mm512_set_16to16_pi(15,15,14,14,13,13,12,12,11,11,10,10,9,9,8,8), + v.v_hi); v1 = _mm512_mask_permutevar_epi32(v1, 0x5555, - 
_mm512_set_16to16_pi(15,15,14,14,13,13,12,12,11,11,10,10,9,9,8,8), - v.v_lo); + _mm512_set_16to16_pi(15,15,14,14,13,13,12,12,11,11,10,10,9,9,8,8), + v.v_lo); v2 = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xAAAA, - _mm512_set_16to16_pi(7,7,6,6,5,5,4,4,3,3,2,2,1,1,0,0), - v.v_hi); + _mm512_set_16to16_pi(7,7,6,6,5,5,4,4,3,3,2,2,1,1,0,0), + v.v_hi); v2 = _mm512_mask_permutevar_epi32(v2, 0x5555, - _mm512_set_16to16_pi(7,7,6,6,5,5,4,4,3,3,2,2,1,1,0,0), - v.v_lo); + _mm512_set_16to16_pi(7,7,6,6,5,5,4,4,3,3,2,2,1,1,0,0), + v.v_lo); _mm512_store_epi64(p, v2); _mm512_store_epi64(((uint8_t*)p)+64, v1); } @@ -1262,33 +1262,33 @@ template <> FORCEINLINE void __store<128>(__vec16_i64 *p, __vec16_i64 v) { /*! gather vector of 64-bit ints from addresses pointing to uniform ints (iw) WARNING: THIS CODE ONLY WORKS FOR GATHERS FROM ARRAYS OF - ***UNIFORM*** INT64's/POINTERS. (problem is that ispc doesn't - expose whether it's from array of uniform or array of varying - poitners, so in here there's no way to tell - only thing we can do - is pick one... - */ + ***UNIFORM*** INT64's/POINTERS. (problem is that ispc doesn't + expose whether it's from array of uniform or array of varying + poitners, so in here there's no way to tell - only thing we can do + is pick one... +*/ static FORCEINLINE __vec16_i64 __gather_base_offsets32_i64(uint8_t *base, uint32_t scale, __vec16_i32 offsets, - __vec16_i1 mask) { + __vec16_i1 mask) { __vec16_i64 ret; ret.v_lo = _mm512_mask_i32extgather_epi32(_mm512_undefined_epi32(), mask, offsets, - base, _MM_UPCONV_EPI32_NONE, scale, - _MM_HINT_NONE); + base, _MM_UPCONV_EPI32_NONE, scale, + _MM_HINT_NONE); ret.v_hi = _mm512_mask_i32extgather_epi32(_mm512_undefined_epi32(), mask, offsets, - base+4, _MM_UPCONV_EPI32_NONE, scale, - _MM_HINT_NONE); + base+4, _MM_UPCONV_EPI32_NONE, scale, + _MM_HINT_NONE); return ret; } /*! gather vector of 64-bit ints from addresses pointing to uniform ints (iw) WARNING: THIS CODE ONLY WORKS FOR GATHERS FROM ARRAYS OF - ***UNIFORM*** INT64's/POINTERS. (problem is that ispc doesn't - expose whether it's from array of uniform or array of varying - poitners, so in here there's no way to tell - only thing we can do - is pick one... - */ - static FORCEINLINE __vec16_i64 + ***UNIFORM*** INT64's/POINTERS. (problem is that ispc doesn't + expose whether it's from array of uniform or array of varying + poitners, so in here there's no way to tell - only thing we can do + is pick one... 
+*/ +static FORCEINLINE __vec16_i64 __gather64_i64(__vec16_i64 addr, __vec16_i1 mask) { __vec16_i64 ret; @@ -1301,16 +1301,16 @@ __gather64_i64(__vec16_i64 addr, __vec16_i1 mask) int first_active_lane = _mm_tzcnt_32((int)still_to_do); const uint32_t &hi32 = ((uint*)&addr.v_hi)[first_active_lane]; __vec16_i1 match = _mm512_mask_cmp_epi32_mask(still_to_do,addr.v_hi, - __smear_i32<__vec16_i32>((int32_t)hi32), - _MM_CMPINT_EQ); + __smear_i32<__vec16_i32>((int32_t)hi32), + _MM_CMPINT_EQ); void * base = (void*)((((unsigned long)hi32) << 32) + (unsigned long)(-(long)INT_MIN)); ret.v_lo = _mm512_mask_i32extgather_epi32(ret.v_lo, match, signed_offsets, - base, _MM_UPCONV_EPI32_NONE, 1, - _MM_HINT_NONE); + base, _MM_UPCONV_EPI32_NONE, 1, + _MM_HINT_NONE); ret.v_hi = _mm512_mask_i32extgather_epi32(ret.v_hi, match, signed_offsets, - base+4, _MM_UPCONV_EPI32_NONE, 1, - _MM_HINT_NONE); + base+4, _MM_UPCONV_EPI32_NONE, 1, + _MM_HINT_NONE); still_to_do = _mm512_kxor(match, still_to_do); } @@ -1345,7 +1345,7 @@ static FORCEINLINE __vec16_i1 __equal_float(__vec16_f a, __vec16_f b) { } static FORCEINLINE __vec16_i1 __equal_float_and_mask(__vec16_f a, __vec16_f b, - __vec16_i1 m) { + __vec16_i1 m) { return _mm512_mask_cmpeq_ps_mask(m, a, b); } @@ -1354,7 +1354,7 @@ static FORCEINLINE __vec16_i1 __not_equal_float(__vec16_f a, __vec16_f b) { } static FORCEINLINE __vec16_i1 __not_equal_float_and_mask(__vec16_f a, __vec16_f b, - __vec16_i1 m) { + __vec16_i1 m) { return _mm512_mask_cmpneq_ps_mask(m, a, b); } @@ -1363,7 +1363,7 @@ static FORCEINLINE __vec16_i1 __less_than_float(__vec16_f a, __vec16_f b) { } static FORCEINLINE __vec16_i1 __less_than_float_and_mask(__vec16_f a, __vec16_f b, - __vec16_i1 m) { + __vec16_i1 m) { return _mm512_mask_cmplt_ps_mask(m, a, b); } @@ -1372,7 +1372,7 @@ static FORCEINLINE __vec16_i1 __less_equal_float(__vec16_f a, __vec16_f b) { } static FORCEINLINE __vec16_i1 __less_equal_float_and_mask(__vec16_f a, __vec16_f b, - __vec16_i1 m) { + __vec16_i1 m) { return _mm512_mask_cmple_ps_mask(m, a, b); } @@ -1381,7 +1381,7 @@ static FORCEINLINE __vec16_i1 __greater_than_float(__vec16_f a, __vec16_f b) { } static FORCEINLINE __vec16_i1 __greater_than_float_and_mask(__vec16_f a, __vec16_f b, - __vec16_i1 m) { + __vec16_i1 m) { return _mm512_mask_cmpnle_ps_mask(m, a, b); } @@ -1390,7 +1390,7 @@ static FORCEINLINE __vec16_i1 __greater_equal_float(__vec16_f a, __vec16_f b) { } static FORCEINLINE __vec16_i1 __greater_equal_float_and_mask(__vec16_f a, __vec16_f b, - __vec16_i1 m) { + __vec16_i1 m) { return _mm512_mask_cmpnlt_ps_mask(m, a, b); } @@ -1519,7 +1519,7 @@ class Float16Compressor static int32_t const maxD = infC - maxC - 1; static int32_t const minD = minC - subC - 1; - public: +public: static uint16_t compress(float value) { @@ -1629,7 +1629,7 @@ static FORCEINLINE __vec16_i1 __equal_double(__vec16_d a, __vec16_d b) { } static FORCEINLINE __vec16_i1 __equal_double_and_mask(__vec16_d a, __vec16_d b, - __vec16_i1 m) { + __vec16_i1 m) { __vec16_i1 ret1; __vec16_i1 ret2; ret1 = _mm512_mask_cmpeq_pd_mask(m, a.v1, b.v1); @@ -1647,7 +1647,7 @@ static FORCEINLINE __vec16_i1 __not_equal_double(__vec16_d a, __vec16_d b) { } static FORCEINLINE __vec16_i1 __not_equal_double_and_mask(__vec16_d a, __vec16_d b, - __vec16_i1 m) { + __vec16_i1 m) { __vec16_i1 ret1; __vec16_i1 ret2; __vec16_i1 tmp_m = m; @@ -1665,7 +1665,7 @@ static FORCEINLINE __vec16_i1 __less_than_double(__vec16_d a, __vec16_d b) { } static FORCEINLINE __vec16_i1 __less_than_double_and_mask(__vec16_d a, __vec16_d b, - __vec16_i1 m) { 
+ __vec16_i1 m) { __vec16_i1 ret1; __vec16_i1 ret2; __vec16_i1 tmp_m = m; @@ -1683,7 +1683,7 @@ static FORCEINLINE __vec16_i1 __less_equal_double(__vec16_d a, __vec16_d b) { } static FORCEINLINE __vec16_i1 __less_equal_double_and_mask(__vec16_d a, __vec16_d b, - __vec16_i1 m) { + __vec16_i1 m) { __vec16_i1 ret1; __vec16_i1 ret2; __vec16_i1 tmp_m = m; @@ -1701,7 +1701,7 @@ static FORCEINLINE __vec16_i1 __greater_than_double(__vec16_d a, __vec16_d b) { } static FORCEINLINE __vec16_i1 __greater_than_double_and_mask(__vec16_d a, __vec16_d b, - __vec16_i1 m) { + __vec16_i1 m) { __vec16_i1 ret1; __vec16_i1 ret2; __vec16_i1 tmp_m = m; @@ -1719,7 +1719,7 @@ static FORCEINLINE __vec16_i1 __greater_equal_double(__vec16_d a, __vec16_d b) { } static FORCEINLINE __vec16_i1 __greater_equal_double_and_mask(__vec16_d a, __vec16_d b, - __vec16_i1 m) { + __vec16_i1 m) { __vec16_i1 ret1; __vec16_i1 ret2; __vec16_i1 tmp_m = m; @@ -1967,15 +1967,15 @@ static FORCEINLINE __vec16_f __cast_sitofp(__vec16_f, __vec16_i64 val) { hilo2zmm(val, tmp1, tmp2); __vec16_f ret; -/* + /* // Cycles don't work. It seems that it is icc bug. for (int i = 0; i < 8; i++) { - ret[i] = (float)(((int64_t*)&tmp1)[i]); + ret[i] = (float)(((int64_t*)&tmp1)[i]); } for (int i = 0; i < 8; i++) { - ((float*)&ret)[i + 8] = (float)(((int64_t*)&tmp2)[i]); + ((float*)&ret)[i + 8] = (float)(((int64_t*)&tmp2)[i]); } -*/ + */ ret[0] = (float)(((int64_t*)&tmp1)[0]); ret[1] = (float)(((int64_t*)&tmp1)[1]); @@ -2065,12 +2065,12 @@ static FORCEINLINE __vec16_f __cast_uitofp(__vec16_f, __vec16_i64 val) { __vec16_f ret; // Cycles don't work. It seems that it is icc bug. /* - for (int i = 0; i < 8; i++) { + for (int i = 0; i < 8; i++) { ((float*)&ret)[i] = ((float)(((uint64_t*)&tmp1)[i])); - } - for (int i = 0; i < 8; i++) { + } + for (int i = 0; i < 8; i++) { ((float*)&ret)[i + 8] = ((float)(((uint64_t*)&tmp2)[i])); - } + } */ ret[0] = ((float)(((uint64_t*)&tmp1)[0])); ret[1] = ((float)(((uint64_t*)&tmp1)[1])); @@ -2296,38 +2296,38 @@ static FORCEINLINE __vec16_i32 __cast_bits(__vec16_i32, __vec16_f val) { static FORCEINLINE __vec16_i64 __cast_bits(__vec16_i64, __vec16_d val) { __vec16_i64 ret; ret.v_hi = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xFF00, - _mm512_set_16to16_pi(15,13,11,9,7,5,3,1,14,12,10,8,6,4,2,0), - _mm512_castpd_si512(val.v2)); + _mm512_set_16to16_pi(15,13,11,9,7,5,3,1,14,12,10,8,6,4,2,0), + _mm512_castpd_si512(val.v2)); ret.v_hi = _mm512_mask_permutevar_epi32(ret.v_hi, 0x00FF, - _mm512_set_16to16_pi(14,12,10,8,6,4,2,0,15,13,11,9,7,5,3,1), - _mm512_castpd_si512(val.v1)); + _mm512_set_16to16_pi(14,12,10,8,6,4,2,0,15,13,11,9,7,5,3,1), + _mm512_castpd_si512(val.v1)); ret.v_lo = _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xFF00, - _mm512_set_16to16_pi(14,12,10,8,6,4,2,0,15,13,11,9,7,5,3,1), - _mm512_castpd_si512(val.v2)); + _mm512_set_16to16_pi(14,12,10,8,6,4,2,0,15,13,11,9,7,5,3,1), + _mm512_castpd_si512(val.v2)); ret.v_lo = _mm512_mask_permutevar_epi32(ret.v_lo, 0x00FF, - _mm512_set_16to16_pi(15,13,11,9,7,5,3,1,14,12,10,8,6,4,2,0), - _mm512_castpd_si512(val.v1)); + _mm512_set_16to16_pi(15,13,11,9,7,5,3,1,14,12,10,8,6,4,2,0), + _mm512_castpd_si512(val.v1)); return ret; } static FORCEINLINE __vec16_d __cast_bits(__vec16_d, __vec16_i64 val) { __vec16_d ret; ret.v2 = _mm512_castsi512_pd( - _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xAAAA, - _mm512_set_16to16_pi(15,15,14,14,13,13,12,12,11,11,10,10,9,9,8,8), - val.v_hi)); + _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xAAAA, + 
_mm512_set_16to16_pi(15,15,14,14,13,13,12,12,11,11,10,10,9,9,8,8), + val.v_hi)); ret.v2 = _mm512_castsi512_pd( - _mm512_mask_permutevar_epi32(_mm512_castpd_si512(ret.v2), 0x5555, - _mm512_set_16to16_pi(15,15,14,14,13,13,12,12,11,11,10,10,9,9,8,8), - val.v_lo)); + _mm512_mask_permutevar_epi32(_mm512_castpd_si512(ret.v2), 0x5555, + _mm512_set_16to16_pi(15,15,14,14,13,13,12,12,11,11,10,10,9,9,8,8), + val.v_lo)); ret.v1 = _mm512_castsi512_pd( - _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xAAAA, - _mm512_set_16to16_pi(7,7,6,6,5,5,4,4,3,3,2,2,1,1,0,0), - val.v_hi)); + _mm512_mask_permutevar_epi32(_mm512_undefined_epi32(), 0xAAAA, + _mm512_set_16to16_pi(7,7,6,6,5,5,4,4,3,3,2,2,1,1,0,0), + val.v_hi)); ret.v1 = _mm512_castsi512_pd( - _mm512_mask_permutevar_epi32(_mm512_castpd_si512(ret.v1), 0x5555, - _mm512_set_16to16_pi(7,7,6,6,5,5,4,4,3,3,2,2,1,1,0,0), - val.v_lo)); + _mm512_mask_permutevar_epi32(_mm512_castpd_si512(ret.v1), 0x5555, + _mm512_set_16to16_pi(7,7,6,6,5,5,4,4,3,3,2,2,1,1,0,0), + val.v_lo)); return ret; } @@ -2335,65 +2335,65 @@ static FORCEINLINE __vec16_d __cast_bits(__vec16_d, __vec16_i64 val) { /////////////////////////////////////////////////////////////////////////// // templates for int8/16 operations /////////////////////////////////////////////////////////////////////////// -#define BINARY_OP(TYPE, NAME, OP) \ -static FORCEINLINE TYPE NAME(TYPE a, TYPE b) { \ - TYPE ret; \ - for (int i = 0; i < 16; ++i) \ - ret[i] = a[i] OP b[i]; \ - return ret; \ -} +#define BINARY_OP(TYPE, NAME, OP) \ + static FORCEINLINE TYPE NAME(TYPE a, TYPE b) { \ + TYPE ret; \ + for (int i = 0; i < 16; ++i) \ + ret[i] = a[i] OP b[i]; \ + return ret; \ + } /* knc::macro::used */ -#define BINARY_OP_CAST(TYPE, CAST, NAME, OP) \ -static FORCEINLINE TYPE NAME(TYPE a, TYPE b) { \ - TYPE ret; \ - for (int i = 0; i < 16; ++i) \ - ret[i] = (CAST)(a[i]) OP (CAST)(b[i]); \ - return ret; \ -} - -#define CMP_OP(TYPE, SUFFIX, CAST, NAME, OP) \ -static FORCEINLINE __vec16_i1 NAME##_##SUFFIX(TYPE a, TYPE b) { \ - __vec16_i1 ret; \ - ret.v = 0; \ - for (int i = 0; i < 16; ++i) \ - ret.v |= ((CAST)(a[i]) OP (CAST)(b[i])) << i; \ - return ret; \ -} - -#define SHIFT_UNIFORM(TYPE, CAST, NAME, OP) \ -static FORCEINLINE TYPE NAME(TYPE a, int32_t b) { \ - TYPE ret; \ - for (int i = 0; i < 16; ++i) \ - ret[i] = (CAST)(a[i]) OP b; \ - return ret; \ -} - -#define SELECT(TYPE) \ -static FORCEINLINE TYPE __select(__vec16_i1 mask, TYPE a, TYPE b) { \ - TYPE ret; \ - for (int i = 0; i < 16; ++i) \ - ret[i] = (mask.v & (1< static RetVecType __setzero_i8(); template <> FORCEINLINE __vec16_i8 __setzero_i8<__vec16_i8>() { - return __vec16_i8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); + return __vec16_i8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); } BINARY_OP(__vec16_i8, __add, +) @@ -2430,25 +2430,25 @@ CMP_OP(__vec16_i8, i8, int8_t, __signed_greater_than, >) SELECT(__vec16_i8) static FORCEINLINE int8_t __extract_element(__vec16_i8 v, uint32_t index) { - return v[index]; + return v[index]; } static FORCEINLINE void __insert_element(__vec16_i8 *v, uint32_t index, int8_t val) { - ((int32_t *)v)[index] = val; + ((int32_t *)v)[index] = val; } static FORCEINLINE __vec16_i8 __broadcast_i8(__vec16_i8 v, int index) { - int32_t val = __extract_element(v, index & 0xf); - __vec16_i32 tmp = _mm512_set1_epi32(val); - __vec16_i8 ret; - _mm512_extstore_epi32(&ret, tmp, _MM_DOWNCONV_EPI32_SINT8,_MM_HINT_NONE); - return ret; + int32_t val = __extract_element(v, index & 0xf); + __vec16_i32 tmp = _mm512_set1_epi32(val); + __vec16_i8 
ret; + _mm512_extstore_epi32(&ret, tmp, _MM_DOWNCONV_EPI32_SINT8,_MM_HINT_NONE); + return ret; } static FORCEINLINE __vec16_i1 __not_equal_i8(__vec16_i8 a, __vec16_i8 b) { - __vec16_i32 tmp_a = _mm512_extload_epi32(&a, _MM_UPCONV_EPI32_SINT8, _MM_BROADCAST32_NONE, _MM_HINT_NONE); - __vec16_i32 tmp_b = _mm512_extload_epi32(&b, _MM_UPCONV_EPI32_SINT8, _MM_BROADCAST32_NONE, _MM_HINT_NONE); - return __not_equal_i32(tmp_a, tmp_b); + __vec16_i32 tmp_a = _mm512_extload_epi32(&a, _MM_UPCONV_EPI32_SINT8, _MM_BROADCAST32_NONE, _MM_HINT_NONE); + __vec16_i32 tmp_b = _mm512_extload_epi32(&b, _MM_UPCONV_EPI32_SINT8, _MM_BROADCAST32_NONE, _MM_HINT_NONE); + return __not_equal_i32(tmp_a, tmp_b); } static FORCEINLINE __vec16_i1 __equal_i8_and_mask(const __vec16_i8 &a, const __vec16_i8 &b, __vec16_i1 m) { @@ -2464,37 +2464,37 @@ static FORCEINLINE __vec16_i1 __not_equal_i8_and_mask(__vec16_i8 a, __vec16_i8 b } static FORCEINLINE __vec16_i8 __rotate_i8(__vec16_i8 v, int index) { - __vec16_i32 tmp_v = _mm512_extload_epi32(&v, _MM_UPCONV_EPI32_SINT8, _MM_BROADCAST32_NONE, _MM_HINT_NONE); - __vec16_i32 tmp = __rotate_i32(tmp_v, index); - __vec16_i8 ret; - _mm512_extstore_epi32(&ret, tmp, _MM_DOWNCONV_EPI32_SINT8,_MM_HINT_NONE); - return ret; + __vec16_i32 tmp_v = _mm512_extload_epi32(&v, _MM_UPCONV_EPI32_SINT8, _MM_BROADCAST32_NONE, _MM_HINT_NONE); + __vec16_i32 tmp = __rotate_i32(tmp_v, index); + __vec16_i8 ret; + _mm512_extstore_epi32(&ret, tmp, _MM_DOWNCONV_EPI32_SINT8,_MM_HINT_NONE); + return ret; } static FORCEINLINE __vec16_i8 __shuffle_i8(__vec16_i8 v, __vec16_i32 index) { - __vec16_i32 tmp_v = _mm512_extload_epi32(&v, _MM_UPCONV_EPI32_SINT8, _MM_BROADCAST32_NONE, _MM_HINT_NONE); - __vec16_i32 tmp = __shuffle_i32(tmp_v, index); - __vec16_i8 ret; - _mm512_extstore_epi32(&ret, tmp, _MM_DOWNCONV_EPI32_SINT8,_MM_HINT_NONE); - return ret; + __vec16_i32 tmp_v = _mm512_extload_epi32(&v, _MM_UPCONV_EPI32_SINT8, _MM_BROADCAST32_NONE, _MM_HINT_NONE); + __vec16_i32 tmp = __shuffle_i32(tmp_v, index); + __vec16_i8 ret; + _mm512_extstore_epi32(&ret, tmp, _MM_DOWNCONV_EPI32_SINT8,_MM_HINT_NONE); + return ret; } template static RetVecType __smear_i8(int8_t i); template <> FORCEINLINE __vec16_i8 __smear_i8<__vec16_i8>(int8_t i) { - __vec16_i32 tmp = __smear_i32<__vec16_i32>(i); - __vec16_i8 ret; - _mm512_extstore_epi32(&ret, tmp, _MM_DOWNCONV_EPI32_SINT8,_MM_HINT_NONE); - return ret; + __vec16_i32 tmp = __smear_i32<__vec16_i32>(i); + __vec16_i8 ret; + _mm512_extstore_epi32(&ret, tmp, _MM_DOWNCONV_EPI32_SINT8,_MM_HINT_NONE); + return ret; } static FORCEINLINE __vec16_i8 __shuffle2_i8(__vec16_i8 v0, __vec16_i8 v1, __vec16_i32 index) { - __vec16_i32 tmp_v0 = _mm512_extload_epi32(&v0, _MM_UPCONV_EPI32_SINT8, _MM_BROADCAST32_NONE, _MM_HINT_NONE); - __vec16_i32 tmp_v1 = _mm512_extload_epi32(&v1, _MM_UPCONV_EPI32_SINT8, _MM_BROADCAST32_NONE, _MM_HINT_NONE); - __vec16_i32 tmp = __shuffle2_i32(tmp_v0, tmp_v1, index); - __vec16_i8 ret; - _mm512_extstore_epi32(&ret, tmp, _MM_DOWNCONV_EPI32_SINT8,_MM_HINT_NONE); - return ret; + __vec16_i32 tmp_v0 = _mm512_extload_epi32(&v0, _MM_UPCONV_EPI32_SINT8, _MM_BROADCAST32_NONE, _MM_HINT_NONE); + __vec16_i32 tmp_v1 = _mm512_extload_epi32(&v1, _MM_UPCONV_EPI32_SINT8, _MM_BROADCAST32_NONE, _MM_HINT_NONE); + __vec16_i32 tmp = __shuffle2_i32(tmp_v0, tmp_v1, index); + __vec16_i8 ret; + _mm512_extstore_epi32(&ret, tmp, _MM_DOWNCONV_EPI32_SINT8,_MM_HINT_NONE); + return ret; } /////////////////////////////////////////////////////////////////////////// // int16 @@ -2503,7 +2503,7 @@ static FORCEINLINE 
__vec16_i8 __shuffle2_i8(__vec16_i8 v0, __vec16_i8 v1, __vec1 template static RetVecType __setzero_i16(); template <> FORCEINLINE __vec16_i16 __setzero_i16<__vec16_i16>() { - return __vec16_i16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); + return __vec16_i16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); } BINARY_OP(__vec16_i16, __add, +) @@ -2540,19 +2540,19 @@ CMP_OP(__vec16_i16, i16, int16_t, __signed_greater_than, >) SELECT(__vec16_i16) static FORCEINLINE int16_t __extract_element(__vec16_i16 v, uint32_t index) { - return v[index]; + return v[index]; } static FORCEINLINE void __insert_element(__vec16_i16 *v, uint32_t index, int16_t val) { - ((int16_t *)v)[index] = val; + ((int16_t *)v)[index] = val; } static FORCEINLINE __vec16_i16 __broadcast_i16(__vec16_i16 v, int index) { - int32_t val = __extract_element(v, index & 0xf); - __vec16_i32 tmp = _mm512_set1_epi32(val); - __vec16_i16 ret; - _mm512_extstore_epi32(&ret, tmp, _MM_DOWNCONV_EPI32_SINT16,_MM_HINT_NONE); - return ret; + int32_t val = __extract_element(v, index & 0xf); + __vec16_i32 tmp = _mm512_set1_epi32(val); + __vec16_i16 ret; + _mm512_extstore_epi32(&ret, tmp, _MM_DOWNCONV_EPI32_SINT16,_MM_HINT_NONE); + return ret; } static FORCEINLINE __vec16_i1 __not_equal_i16(__vec16_i16 a, __vec16_i16 b) { @@ -2794,8 +2794,8 @@ static FORCEINLINE int32_t __count_leading_zeros_i32(__vec1_i32 mask) { return 32; while (1) { if (mask < 0) break; - n ++; - mask <<= 1; + n ++; + mask <<= 1; } return n; } @@ -2806,8 +2806,8 @@ static FORCEINLINE int64_t __count_leading_zeros_i64(__vec1_i64 mask) { return 64; while (1) { if (mask < 0) break; - n ++; - mask <<= 1; + n ++; + mask <<= 1; } return n; } @@ -2990,16 +2990,16 @@ static FORCEINLINE void __masked_store_i8(void *p, const __vec16_i8 &val, __vec1 #ifdef ISPC_FORCE_ALIGNED_MEMORY _mm512_mask_extstore_epi32(p, mask, tmp, _MM_DOWNCONV_EPI32_SINT8,_MM_HINT_NONE); #else - #if 0 // TODO: both implementations seem to work, need to test which one is faster +#if 0 // TODO: both implementations seem to work, need to test which one is faster _mm512_mask_i32extscatter_epi32 (p, mask, __vec16_i32(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15), tmp, _MM_DOWNCONV_EPI32_SINT8, sizeof(uint8_t), _MM_HINT_NONE); - #else +#else __vec16_i32 tmp_; tmp_.v = _mm512_extloadunpacklo_epi32(tmp_.v, p, _MM_UPCONV_EPI32_SINT8, _MM_HINT_NONE); tmp_.v = _mm512_extloadunpackhi_epi32(tmp_.v, (uint8_t*)p+64, _MM_UPCONV_EPI32_SINT8, _MM_HINT_NONE); tmp_.v = _mm512_mask_mov_epi32(tmp_.v, mask, tmp.v); _mm512_extpackstorelo_epi32(p, tmp_.v, _MM_DOWNCONV_EPI32_SINT8, _MM_HINT_NONE); _mm512_extpackstorehi_epi32((uint8_t*)p+64, tmp_.v, _MM_DOWNCONV_EPI32_SINT8, _MM_HINT_NONE); - #endif // if 0 +#endif // if 0 #endif } @@ -3025,13 +3025,13 @@ template static FORCEINLINE void __store(__vec16_i8 *p, __vec16_i8 v static FORCEINLINE void __scatter_base_offsets32_i8(uint8_t *b, uint32_t scale, __vec16_i32 offsets, - __vec16_i8 val, __vec16_i1 mask) + __vec16_i8 val, __vec16_i1 mask) { __vec16_i32 tmp = _mm512_extload_epi32(&val,_MM_UPCONV_EPI32_SINT8, - _MM_BROADCAST32_NONE, _MM_HINT_NONE); + _MM_BROADCAST32_NONE, _MM_HINT_NONE); _mm512_mask_i32extscatter_epi32(b, mask, offsets, tmp, - _MM_DOWNCONV_EPI32_SINT8, scale, - _MM_HINT_NONE); + _MM_DOWNCONV_EPI32_SINT8, scale, + _MM_HINT_NONE); } @@ -3040,16 +3040,16 @@ static FORCEINLINE void __masked_store_i16(void *p, const __vec16_i16 &val, __ve #ifdef ISPC_FORCE_ALIGNED_MEMORY _mm512_mask_extstore_epi32(p, mask, tmp, _MM_DOWNCONV_EPI32_SINT16, _MM_HINT_NONE); #else - #if 0 // TODO: both 
implementations seem to work, need to test which one is faster +#if 0 // TODO: both implementations seem to work, need to test which one is faster _mm512_mask_i32extscatter_epi32 (p, mask, __vec16_i32(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15), tmp, _MM_DOWNCONV_EPI32_SINT16, sizeof(uint16_t), _MM_HINT_NONE); - #else +#else __vec16_i32 tmp_; tmp_.v = _mm512_extloadunpacklo_epi32(tmp_.v, p, _MM_UPCONV_EPI32_SINT16, _MM_HINT_NONE); tmp_.v = _mm512_extloadunpackhi_epi32(tmp_.v, (uint8_t*)p+64, _MM_UPCONV_EPI32_SINT16, _MM_HINT_NONE); tmp_.v = _mm512_mask_mov_epi32(tmp_.v, mask, tmp.v); _mm512_extpackstorelo_epi32(p, tmp_.v, _MM_DOWNCONV_EPI32_SINT16, _MM_HINT_NONE); _mm512_extpackstorehi_epi32((uint8_t*)p+64, tmp_.v, _MM_DOWNCONV_EPI32_SINT16, _MM_HINT_NONE); - #endif // if 0 +#endif // if 0 #endif } @@ -3090,7 +3090,7 @@ static FORCEINLINE void __masked_store_i32(void *p, __vec16_i32 val, __vec16_i1 } static FORCEINLINE void __masked_store_float(void *p, __vec16_f val, - __vec16_i1 mask) { + __vec16_i1 mask) { #ifdef ISPC_FORCE_ALIGNED_MEMORY _mm512_mask_store_ps(p, mask, val.v); #else @@ -3104,7 +3104,7 @@ static FORCEINLINE void __masked_store_float(void *p, __vec16_f val, } static FORCEINLINE void __masked_store_double(void *p, __vec16_d val, - __vec16_i1 mask) { + __vec16_i1 mask) { #ifdef ISPC_FORCE_ALIGNED_MEMORY __vec16_i1 tmp_m = mask; tmp_m = _mm512_kswapb(tmp_m, tmp_m); @@ -3128,12 +3128,12 @@ static FORCEINLINE void __masked_store_double(void *p, __vec16_d val, } static FORCEINLINE void __masked_store_blend_i32(void *p, __vec16_i32 val, - __vec16_i1 mask) { + __vec16_i1 mask) { __masked_store_i32(p, val, mask); } static FORCEINLINE void __masked_store_blend_float(void *p, __vec16_f val, - __vec16_i1 mask) { + __vec16_i1 mask) { __masked_store_float(p, val, mask); } @@ -3145,11 +3145,11 @@ static FORCEINLINE void __masked_store_blend_float(void *p, __vec16_f val, static FORCEINLINE __vec16_i8 __gather_base_offsets32_i8(uint8_t *base, uint32_t scale, __vec16_i32 offsets, - __vec16_i1 mask) { + __vec16_i1 mask) { // (iw): need to temporarily store as int because gathers can only return ints. 
__vec16_i32 tmp = _mm512_mask_i32extgather_epi32(_mm512_undefined_epi32(), mask, offsets, base, - _MM_UPCONV_EPI32_SINT8, scale, - _MM_HINT_NONE); + _MM_UPCONV_EPI32_SINT8, scale, + _MM_HINT_NONE); // now, downconverting to chars into temporary char vector __vec16_i8 ret; _mm512_extstore_epi32(ret.v,tmp,_MM_DOWNCONV_EPI32_SINT8,_MM_HINT_NONE); @@ -3158,31 +3158,31 @@ __gather_base_offsets32_i8(uint8_t *base, uint32_t scale, __vec16_i32 offsets, static FORCEINLINE __vec16_i32 __gather_base_offsets32_i32(uint8_t *base, uint32_t scale, __vec16_i32 offsets, - __vec16_i1 mask) { + __vec16_i1 mask) { return _mm512_mask_i32extgather_epi32(_mm512_undefined_epi32(), mask, offsets, - base, _MM_UPCONV_EPI32_NONE, scale, - _MM_HINT_NONE); + base, _MM_UPCONV_EPI32_NONE, scale, + _MM_HINT_NONE); } static FORCEINLINE __vec16_f __gather_base_offsets32_float(uint8_t *base, uint32_t scale, __vec16_i32 offsets, - __vec16_i1 mask) { + __vec16_i1 mask) { return _mm512_mask_i32extgather_ps(_mm512_undefined_ps(), mask, offsets, - base, _MM_UPCONV_PS_NONE, scale, - _MM_HINT_NONE); + base, _MM_UPCONV_PS_NONE, scale, + _MM_HINT_NONE); } static FORCEINLINE __vec16_d __gather_base_offsets32_double(uint8_t *base, uint32_t scale, __vec16_i32 offsets, - __vec16_i1 mask) { + __vec16_i1 mask) { __vec16_d ret; ret.v1 = _mm512_mask_i32loextgather_pd(_mm512_undefined_pd(), mask, offsets, - base, _MM_UPCONV_PD_NONE, scale, - _MM_HINT_NONE); + base, _MM_UPCONV_PD_NONE, scale, + _MM_HINT_NONE); __m512i shuffled_offsets = _mm512_permute4f128_epi32(offsets.v, _MM_PERM_DCDC); ret.v2 = _mm512_mask_i32loextgather_pd(_mm512_undefined_pd(), mask, shuffled_offsets, - base, _MM_UPCONV_PD_NONE, scale, - _MM_HINT_NONE); + base, _MM_UPCONV_PD_NONE, scale, + _MM_HINT_NONE); return ret; } @@ -3199,14 +3199,14 @@ __gather64_float(__vec16_i64 addr, __vec16_i1 mask) int first_active_lane = _mm_tzcnt_32((int)still_to_do); const uint &hi32 = ((uint*)&addr.v_hi)[first_active_lane]; __vec16_i1 match = _mm512_mask_cmp_epi32_mask(still_to_do,addr.v_hi, - __smear_i32<__vec16_i32>((int32_t)hi32), - _MM_CMPINT_EQ); + __smear_i32<__vec16_i32>((int32_t)hi32), + _MM_CMPINT_EQ); void * base = (void*)((((unsigned long)hi32) << 32) + (unsigned long)(-(long)INT_MIN)); ret.v = _mm512_mask_i32extgather_ps(ret.v, match, signed_offsets, - base, _MM_UPCONV_PS_NONE, 1, - _MM_HINT_NONE); + base, _MM_UPCONV_PS_NONE, 1, + _MM_HINT_NONE); still_to_do = _mm512_kxor(match, still_to_do); } @@ -3223,7 +3223,7 @@ __gather64_float(__vec16_i64 addr, __vec16_i1 mask) static FORCEINLINE __vec16_f __gather_base_offsets64_float(uint8_t *_base, uint32_t scale, __vec16_i64 offsets, - __vec16_i1 mask) { + __vec16_i1 mask) { const __vec16_i32 signed_offsets = _mm512_add_epi32(offsets.v_lo, __smear_i32<__vec16_i32>((int32_t)INT_MIN)); // There is no gather instruction with 64-bit offsets in KNC. 
@@ -3234,14 +3234,14 @@ __gather_base_offsets64_float(uint8_t *_base, uint32_t scale, __vec16_i64 offset int first_active_lane = _mm_tzcnt_32((int)still_to_do); const uint &hi32 = ((uint*)&offsets.v_hi)[first_active_lane]; __vec16_i1 match = _mm512_mask_cmp_epi32_mask(mask,offsets.v_hi, - __smear_i32<__vec16_i32>((int32_t)hi32), - _MM_CMPINT_EQ); + __smear_i32<__vec16_i32>((int32_t)hi32), + _MM_CMPINT_EQ); void * base = (void*)((unsigned long)_base + - ((scale*(unsigned long)hi32) << 32) + scale*(unsigned long)(-(long)INT_MIN)); + ((scale*(unsigned long)hi32) << 32) + scale*(unsigned long)(-(long)INT_MIN)); ret = _mm512_mask_i32extgather_ps(ret, match, signed_offsets, base, - _MM_UPCONV_PS_NONE, scale, - _MM_HINT_NONE); + _MM_UPCONV_PS_NONE, scale, + _MM_HINT_NONE); still_to_do = _mm512_kxor(match, still_to_do); } @@ -3250,7 +3250,7 @@ __gather_base_offsets64_float(uint8_t *_base, uint32_t scale, __vec16_i64 offset static FORCEINLINE __vec16_i8 __gather_base_offsets64_i8(uint8_t *_base, uint32_t scale, __vec16_i64 offsets, - __vec16_i1 mask) + __vec16_i1 mask) { const __vec16_i32 signed_offsets = _mm512_add_epi32(offsets.v_lo, __smear_i32<__vec16_i32>((int32_t)INT_MIN)); @@ -3260,14 +3260,14 @@ static FORCEINLINE __vec16_i8 __gather_base_offsets64_i8(uint8_t *_base, uint32_ int first_active_lane = _mm_tzcnt_32((int)still_to_do); const uint &hi32 = ((uint*)&offsets.v_hi)[first_active_lane]; __vec16_i1 match = _mm512_mask_cmp_epi32_mask(mask,offsets.v_hi, - __smear_i32<__vec16_i32>((int32_t)hi32), - _MM_CMPINT_EQ); + __smear_i32<__vec16_i32>((int32_t)hi32), + _MM_CMPINT_EQ); void * base = (void*)((unsigned long)_base + - ((scale*(unsigned long)hi32) << 32) + scale*(unsigned long)(-(long)INT_MIN)); + ((scale*(unsigned long)hi32) << 32) + scale*(unsigned long)(-(long)INT_MIN)); tmp = _mm512_mask_i32extgather_epi32(tmp, match, signed_offsets, base, - _MM_UPCONV_EPI32_SINT8, scale, - _MM_HINT_NONE); + _MM_UPCONV_EPI32_SINT8, scale, + _MM_HINT_NONE); still_to_do = _mm512_kxor(match,still_to_do); } __vec16_i8 ret; @@ -3277,8 +3277,8 @@ static FORCEINLINE __vec16_i8 __gather_base_offsets64_i8(uint8_t *_base, uint32_ static FORCEINLINE void __scatter_base_offsets64_float(uint8_t *_base, uint32_t scale, __vec16_i64 offsets, - __vec16_f value, - __vec16_i1 mask) { + __vec16_f value, + __vec16_i1 mask) { const __vec16_i32 signed_offsets = _mm512_add_epi32(offsets.v_lo, __smear_i32<__vec16_i32>((int32_t)INT_MIN)); __vec16_i1 still_to_do = mask; @@ -3286,22 +3286,22 @@ static FORCEINLINE void __scatter_base_offsets64_float(uint8_t *_base, uint32_t int first_active_lane = _mm_tzcnt_32((int)still_to_do); const uint &hi32 = ((uint*)&offsets.v_hi)[first_active_lane]; __vec16_i1 match = _mm512_mask_cmp_epi32_mask(mask,offsets.v_hi, - __smear_i32<__vec16_i32>((int32_t)hi32), - _MM_CMPINT_EQ); + __smear_i32<__vec16_i32>((int32_t)hi32), + _MM_CMPINT_EQ); void * base = (void*)((unsigned long)_base + - ((scale*(unsigned long)hi32) << 32) + scale*(unsigned long)(-(long)INT_MIN)); + ((scale*(unsigned long)hi32) << 32) + scale*(unsigned long)(-(long)INT_MIN)); _mm512_mask_i32extscatter_ps(base, match, signed_offsets, - value, - _MM_DOWNCONV_PS_NONE, scale, - _MM_HINT_NONE); + value, + _MM_DOWNCONV_PS_NONE, scale, + _MM_HINT_NONE); still_to_do = _mm512_kxor(match,still_to_do); } } static FORCEINLINE void __scatter_base_offsets64_i32(uint8_t *_base, uint32_t scale, __vec16_i64 offsets, - __vec16_i32 value, - __vec16_i1 mask) { + __vec16_i32 value, + __vec16_i1 mask) { const __vec16_i32 signed_offsets = 
_mm512_add_epi32(offsets.v_lo, __smear_i32<__vec16_i32>((int32_t)INT_MIN)); __vec16_i1 still_to_do = mask; @@ -3309,22 +3309,22 @@ static FORCEINLINE void __scatter_base_offsets64_i32(uint8_t *_base, uint32_t sc int first_active_lane = _mm_tzcnt_32((int)still_to_do); const uint &hi32 = ((uint*)&offsets.v_hi)[first_active_lane]; __vec16_i1 match = _mm512_mask_cmp_epi32_mask(mask,offsets.v_hi, - __smear_i32<__vec16_i32>((int32_t)hi32), - _MM_CMPINT_EQ); + __smear_i32<__vec16_i32>((int32_t)hi32), + _MM_CMPINT_EQ); void * base = (void*)((unsigned long)_base + - ((scale*(unsigned long)hi32) << 32) + scale*(unsigned long)(-(long)INT_MIN)); + ((scale*(unsigned long)hi32) << 32) + scale*(unsigned long)(-(long)INT_MIN)); _mm512_mask_i32extscatter_epi32(base, match, signed_offsets, - value, - _MM_DOWNCONV_EPI32_NONE, scale, - _MM_HINT_NONE); + value, + _MM_DOWNCONV_EPI32_NONE, scale, + _MM_HINT_NONE); still_to_do = _mm512_kxor(match,still_to_do); } } static FORCEINLINE void __scatter_base_offsets64_i64(uint8_t *_base, uint32_t scale, __vec16_i64 offsets, - __vec16_i64 value, - __vec16_i1 mask) { + __vec16_i64 value, + __vec16_i1 mask) { const __vec16_i32 signed_offsets = _mm512_add_epi32(offsets.v_lo, __smear_i32<__vec16_i32>((int32_t)INT_MIN)); __vec16_i1 still_to_do = mask; @@ -3332,8 +3332,8 @@ static FORCEINLINE void __scatter_base_offsets64_i64(uint8_t *_base, uint32_t sc int first_active_lane = _mm_tzcnt_32((int)still_to_do); const uint &hi32 = ((uint*)&offsets.v_hi)[first_active_lane]; __vec16_i1 match = _mm512_mask_cmp_epi32_mask(mask,offsets.v_hi, - __smear_i32<__vec16_i32>((int32_t)hi32), - _MM_CMPINT_EQ); + __smear_i32<__vec16_i32>((int32_t)hi32), + _MM_CMPINT_EQ); void * base = (void*)((unsigned long)_base + ((scale*(unsigned long)hi32) << 32) + scale*(unsigned long)(-(long)INT_MIN)); @@ -3346,27 +3346,27 @@ static FORCEINLINE void __scatter_base_offsets64_i64(uint8_t *_base, uint32_t sc static FORCEINLINE void // TODO __scatter_base_offsets64_i8(uint8_t *_base, uint32_t scale, __vec16_i64 offsets, - __vec16_i8 value, - __vec16_i1 mask) { + __vec16_i8 value, + __vec16_i1 mask) { __vec16_i1 still_to_do = mask; __vec16_i32 tmp = _mm512_extload_epi32(&value, _MM_UPCONV_EPI32_SINT8, - _MM_BROADCAST32_NONE, _MM_HINT_NONE); + _MM_BROADCAST32_NONE, _MM_HINT_NONE); // _mm512_mask_extstore_epi32(p, mask, tmp, _MM_DOWNCONV_EPI32_SINT8,_MM_HINT_NONE); while (still_to_do) { int first_active_lane = _mm_tzcnt_32((int)still_to_do); const uint &hi32 = ((uint*)&offsets.v_hi)[first_active_lane]; __vec16_i1 match = _mm512_mask_cmp_epi32_mask(mask,offsets.v_hi, - __smear_i32<__vec16_i32>((int32_t)hi32), - _MM_CMPINT_EQ); + __smear_i32<__vec16_i32>((int32_t)hi32), + _MM_CMPINT_EQ); void * base = (void*)((unsigned long)_base + - ((scale*(unsigned long)hi32) << 32)); + ((scale*(unsigned long)hi32) << 32)); _mm512_mask_i32extscatter_epi32(base, match, offsets.v_lo, - tmp, - _MM_DOWNCONV_EPI32_SINT8, scale, - _MM_HINT_NONE); + tmp, + _MM_DOWNCONV_EPI32_SINT8, scale, + _MM_HINT_NONE); still_to_do = _mm512_kxor(match,still_to_do); } } @@ -3374,7 +3374,7 @@ __scatter_base_offsets64_i8(uint8_t *_base, uint32_t scale, __vec16_i64 offsets, static FORCEINLINE __vec16_i32 __gather_base_offsets64_i32(uint8_t *_base, uint32_t scale, __vec16_i64 offsets, - __vec16_i1 mask) + __vec16_i1 mask) { __vec16_f r = __gather_base_offsets64_float(_base,scale,offsets,mask); return (__vec16_i32&)r; @@ -3408,25 +3408,25 @@ static FORCEINLINE void __scatter_base_offsets32_double(void *base, uint32_t sca /* -static FORCEINLINE void 
__scatter64_float(__vec16_i64 ptrs, __vec16_f val, __vec16_i1 mask) { -} + static FORCEINLINE void __scatter64_float(__vec16_i64 ptrs, __vec16_f val, __vec16_i1 mask) { + } -static FORCEINLINE void __scatter64_double(__vec16_i64 ptrs, __vec16_d val, __vec16_i1 mask) { -} + static FORCEINLINE void __scatter64_double(__vec16_i64 ptrs, __vec16_d val, __vec16_i1 mask) { + } -static FORCEINLINE void __scatter64_i8(__vec16_i64 ptrs, __vec16_i8 val, __vec16_i1 mask) { -} + static FORCEINLINE void __scatter64_i8(__vec16_i64 ptrs, __vec16_i8 val, __vec16_i1 mask) { + } -static FORCEINLINE void __scatter64_i16(__vec16_i64 ptrs, __vec16_i16 val, __vec16_i1 mask) { -} + static FORCEINLINE void __scatter64_i16(__vec16_i64 ptrs, __vec16_i16 val, __vec16_i1 mask) { + } -static FORCEINLINE void __scatter64_i32(__vec16_i64 ptrs, __vec16_i32 val, __vec16_i1 mask) { -} + static FORCEINLINE void __scatter64_i32(__vec16_i64 ptrs, __vec16_i32 val, __vec16_i1 mask) { + } */ static FORCEINLINE void __scatter64_i64(__vec16_i64 ptrs, __vec16_i64 val, __vec16_i1 mask) { #if __INTEL_COMPILER < 1500 - #warning "__scatter64_i64 is slow due to outdated compiler" +#warning "__scatter64_i64 is slow due to outdated compiler" __scatter_base_offsets64_i64(0, 1, ptrs, val, mask); #else __vec16_i32 first8ptrs, second8ptrs; @@ -3485,12 +3485,12 @@ static FORCEINLINE void __prefetch_read_uniform_nt(uint8_t *p) { // _mm_prefetch(p, _MM_HINT_NTA); // prefetch into L1$ with non-temporal hint } -#define PREFETCH_READ_VARYING(CACHE_NUM, HINT) \ -static FORCEINLINE void __prefetch_read_varying_##CACHE_NUM##_native(uint8_t *base, uint32_t scale, \ - __vec16_i32 offsets, __vec16_i1 mask) { \ - _mm512_mask_prefetch_i32gather_ps (offsets, mask, base, scale, HINT); \ -} \ -static FORCEINLINE void __prefetch_read_varying_##CACHE_NUM(__vec16_i64 addr, __vec16_i1 mask) {} \ +#define PREFETCH_READ_VARYING(CACHE_NUM, HINT) \ + static FORCEINLINE void __prefetch_read_varying_##CACHE_NUM##_native(uint8_t *base, uint32_t scale, \ + __vec16_i32 offsets, __vec16_i1 mask) { \ + _mm512_mask_prefetch_i32gather_ps (offsets, mask, base, scale, HINT); \ + } \ + static FORCEINLINE void __prefetch_read_varying_##CACHE_NUM(__vec16_i64 addr, __vec16_i1 mask) {} \ PREFETCH_READ_VARYING(1, _MM_HINT_T0) PREFETCH_READ_VARYING(2, _MM_HINT_T1) @@ -3504,177 +3504,177 @@ PREFETCH_READ_VARYING(nt, _MM_HINT_T2) static FORCEINLINE uint32_t __atomic_add(uint32_t *p, uint32_t v) { #ifdef _MSC_VER - return InterlockedAdd((LONG volatile *)p, v) - v; +return InterlockedAdd((LONG volatile *)p, v) - v; #else - return __sync_fetch_and_add(p, v); +return __sync_fetch_and_add(p, v); #endif } static FORCEINLINE uint32_t __atomic_sub(uint32_t *p, uint32_t v) { #ifdef _MSC_VER - return InterlockedAdd((LONG volatile *)p, -v) + v; +return InterlockedAdd((LONG volatile *)p, -v) + v; #else - return __sync_fetch_and_sub(p, v); +return __sync_fetch_and_sub(p, v); #endif } static FORCEINLINE uint32_t __atomic_and(uint32_t *p, uint32_t v) { #ifdef _MSC_VER - return _InterlockedAnd((LONG volatile *)p, v); +return _InterlockedAnd((LONG volatile *)p, v); #else - return __sync_fetch_and_and(p, v); +return __sync_fetch_and_and(p, v); #endif } static FORCEINLINE uint32_t __atomic_or(uint32_t *p, uint32_t v) { #ifdef _MSC_VER - return _InterlockedOr((LONG volatile *)p, v); +return _InterlockedOr((LONG volatile *)p, v); #else - return __sync_fetch_and_or(p, v); +return __sync_fetch_and_or(p, v); #endif } static FORCEINLINE uint32_t __atomic_xor(uint32_t *p, uint32_t v) { #ifdef _MSC_VER - return 
_InterlockedXor((LONG volatile *)p, v); +return _InterlockedXor((LONG volatile *)p, v); #else - return __sync_fetch_and_xor(p, v); +return __sync_fetch_and_xor(p, v); #endif } static FORCEINLINE uint32_t __atomic_min(uint32_t *p, uint32_t v) { - int32_t old, min; - do { - old = *((volatile int32_t *)p); - min = (old < (int32_t)v) ? old : (int32_t)v; +int32_t old, min; +do { +old = *((volatile int32_t *)p); +min = (old < (int32_t)v) ? old : (int32_t)v; #ifdef _MSC_VER - } while (InterlockedCompareExchange((LONG volatile *)p, min, old) != old); +} while (InterlockedCompareExchange((LONG volatile *)p, min, old) != old); #else - } while (__sync_bool_compare_and_swap(p, old, min) == false); +} while (__sync_bool_compare_and_swap(p, old, min) == false); #endif - return old; +return old; } static FORCEINLINE uint32_t __atomic_max(uint32_t *p, uint32_t v) { - int32_t old, max; - do { - old = *((volatile int32_t *)p); - max = (old > (int32_t)v) ? old : (int32_t)v; +int32_t old, max; +do { +old = *((volatile int32_t *)p); +max = (old > (int32_t)v) ? old : (int32_t)v; #ifdef _MSC_VER - } while (InterlockedCompareExchange((LONG volatile *)p, max, old) != old); +} while (InterlockedCompareExchange((LONG volatile *)p, max, old) != old); #else - } while (__sync_bool_compare_and_swap(p, old, max) == false); +} while (__sync_bool_compare_and_swap(p, old, max) == false); #endif - return old; +return old; } static FORCEINLINE uint32_t __atomic_umin(uint32_t *p, uint32_t v) { - uint32_t old, min; - do { - old = *((volatile uint32_t *)p); - min = (old < v) ? old : v; +uint32_t old, min; +do { +old = *((volatile uint32_t *)p); +min = (old < v) ? old : v; #ifdef _MSC_VER - } while (InterlockedCompareExchange((LONG volatile *)p, min, old) != old); +} while (InterlockedCompareExchange((LONG volatile *)p, min, old) != old); #else - } while (__sync_bool_compare_and_swap(p, old, min) == false); +} while (__sync_bool_compare_and_swap(p, old, min) == false); #endif - return old; +return old; } static FORCEINLINE uint32_t __atomic_umax(uint32_t *p, uint32_t v) { - uint32_t old, max; - do { - old = *((volatile uint32_t *)p); - max = (old > v) ? old : v; +uint32_t old, max; +do { +old = *((volatile uint32_t *)p); +max = (old > v) ? 
old : v; #ifdef _MSC_VER - } while (InterlockedCompareExchange((LONG volatile *)p, max, old) != old); +} while (InterlockedCompareExchange((LONG volatile *)p, max, old) != old); #else - } while (__sync_bool_compare_and_swap(p, old, max) == false); +} while (__sync_bool_compare_and_swap(p, old, max) == false); #endif - return old; +return old; } static FORCEINLINE uint32_t __atomic_xchg(uint32_t *p, uint32_t v) { #ifdef _MSC_VER - return InterlockedExchange((LONG volatile *)p, v); +return InterlockedExchange((LONG volatile *)p, v); #else - return __sync_lock_test_and_set(p, v); +return __sync_lock_test_and_set(p, v); #endif } static FORCEINLINE uint32_t __atomic_cmpxchg(uint32_t *p, uint32_t cmpval, - uint32_t newval) { + uint32_t newval) { #ifdef _MSC_VER - return InterlockedCompareExchange((LONG volatile *)p, newval, cmpval); +return InterlockedCompareExchange((LONG volatile *)p, newval, cmpval); #else - return __sync_val_compare_and_swap(p, cmpval, newval); +return __sync_val_compare_and_swap(p, cmpval, newval); #endif } static FORCEINLINE uint64_t __atomic_add(uint64_t *p, uint64_t v) { #ifdef _MSC_VER - return InterlockedAdd64((LONGLONG volatile *)p, v) - v; +return InterlockedAdd64((LONGLONG volatile *)p, v) - v; #else - return __sync_fetch_and_add(p, v); +return __sync_fetch_and_add(p, v); #endif } static FORCEINLINE uint64_t __atomic_sub(uint64_t *p, uint64_t v) { #ifdef _MSC_VER - return InterlockedAdd64((LONGLONG volatile *)p, -v) + v; +return InterlockedAdd64((LONGLONG volatile *)p, -v) + v; #else - return __sync_fetch_and_sub(p, v); +return __sync_fetch_and_sub(p, v); #endif } static FORCEINLINE uint64_t __atomic_and(uint64_t *p, uint64_t v) { #ifdef _MSC_VER - return InterlockedAnd64((LONGLONG volatile *)p, v) - v; +return InterlockedAnd64((LONGLONG volatile *)p, v) - v; #else - return __sync_fetch_and_and(p, v); +return __sync_fetch_and_and(p, v); #endif } static FORCEINLINE uint64_t __atomic_or(uint64_t *p, uint64_t v) { #ifdef _MSC_VER - return InterlockedOr64((LONGLONG volatile *)p, v) - v; +return InterlockedOr64((LONGLONG volatile *)p, v) - v; #else - return __sync_fetch_and_or(p, v); +return __sync_fetch_and_or(p, v); #endif } static FORCEINLINE uint64_t __atomic_xor(uint64_t *p, uint64_t v) { #ifdef _MSC_VER - return InterlockedXor64((LONGLONG volatile *)p, v) - v; +return InterlockedXor64((LONGLONG volatile *)p, v) - v; #else - return __sync_fetch_and_xor(p, v); +return __sync_fetch_and_xor(p, v); #endif } static FORCEINLINE uint64_t __atomic_min(uint64_t *p, uint64_t v) { - int64_t old, min; - do { - old = *((volatile int64_t *)p); - min = (old < (int64_t)v) ? old : (int64_t)v; +int64_t old, min; +do { +old = *((volatile int64_t *)p); +min = (old < (int64_t)v) ? old : (int64_t)v; #ifdef _MSC_VER - } while (InterlockedCompareExchange64((LONGLONG volatile *)p, min, old) != old); +} while (InterlockedCompareExchange64((LONGLONG volatile *)p, min, old) != old); #else - } while (__sync_bool_compare_and_swap(p, old, min) == false); +} while (__sync_bool_compare_and_swap(p, old, min) == false); #endif - return old; +return old; } static FORCEINLINE uint64_t __atomic_max(uint64_t *p, uint64_t v) { - int64_t old, max; - do { - old = *((volatile int64_t *)p); - max = (old > (int64_t)v) ? old : (int64_t)v; +int64_t old, max; +do { +old = *((volatile int64_t *)p); +max = (old > (int64_t)v) ? 
old : (int64_t)v; #ifdef _MSC_VER - } while (InterlockedCompareExchange64((LONGLONG volatile *)p, max, old) != old); + } while (InterlockedCompareExchange64((LONGLONG volatile *)p, max, old) != old); #else - } while (__sync_bool_compare_and_swap(p, old, max) == false); +} while (__sync_bool_compare_and_swap(p, old, max) == false); #endif - return old; +return old; } static FORCEINLINE uint64_t __atomic_umin(uint64_t *p, uint64_t v) { @@ -3685,9 +3685,9 @@ static FORCEINLINE uint64_t __atomic_umin(uint64_t *p, uint64_t v) { #ifdef _MSC_VER } while (InterlockedCompareExchange64((LONGLONG volatile *)p, min, old) != old); #else - } while (__sync_bool_compare_and_swap(p, old, min) == false); +} while (__sync_bool_compare_and_swap(p, old, min) == false); #endif - return old; +return old; } static FORCEINLINE uint64_t __atomic_umax(uint64_t *p, uint64_t v) { @@ -3698,9 +3698,9 @@ static FORCEINLINE uint64_t __atomic_umax(uint64_t *p, uint64_t v) { #ifdef _MSC_VER } while (InterlockedCompareExchange64((LONGLONG volatile *)p, max, old) != old); #else - } while (__sync_bool_compare_and_swap(p, old, max) == false); +} while (__sync_bool_compare_and_swap(p, old, max) == false); #endif - return old; +return old; } static FORCEINLINE uint64_t __atomic_xchg(uint64_t *p, uint64_t v) { @@ -3712,7 +3712,7 @@ static FORCEINLINE uint64_t __atomic_xchg(uint64_t *p, uint64_t v) { } static FORCEINLINE uint64_t __atomic_cmpxchg(uint64_t *p, uint64_t cmpval, - uint64_t newval) { + uint64_t newval) { #ifdef _MSC_VER return InterlockedCompareExchange64((LONGLONG volatile *)p, newval, cmpval); #else @@ -3728,10 +3728,10 @@ static FORCEINLINE uint64_t __clock() { uint32_t low, high; #ifdef __x86_64 __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid" - ::: "%rax", "%rbx", "%rcx", "%rdx" ); + ::: "%rax", "%rbx", "%rcx", "%rdx" ); #else __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid" - ::: "%eax", "%ebx", "%ecx", "%edx" ); + ::: "%eax", "%ebx", "%ecx", "%edx" ); #endif __asm__ __volatile__ ("rdtsc" : "=a" (low), "=d" (high)); return (uint64_t)high << 32 | low; @@ -3742,11 +3742,11 @@ static FORCEINLINE uint64_t __clock() { // Transcendentals -#define TRANSCENDENTALS(op) \ -static FORCEINLINE __vec16_f __##op##_varying_float(__vec16_f v) { return _mm512_##op##_ps(v); } \ -static FORCEINLINE float __##op##_uniform_float(float v) { return op##f(v); } \ -static FORCEINLINE __vec16_d __##op##_varying_double(__vec16_d v) { return __vec16_d(_mm512_##op##_pd(v.v1),_mm512_##op##_pd(v.v2)); } \ -static FORCEINLINE double __##op##_uniform_double(double a) { return op(a); } +#define TRANSCENDENTALS(op) \ + static FORCEINLINE __vec16_f __##op##_varying_float(__vec16_f v) { return _mm512_##op##_ps(v); } \ + static FORCEINLINE float __##op##_uniform_float(float v) { return op##f(v); } \ + static FORCEINLINE __vec16_d __##op##_varying_double(__vec16_d v) { return __vec16_d(_mm512_##op##_pd(v.v1),_mm512_##op##_pd(v.v2)); } \ + static FORCEINLINE double __##op##_uniform_double(double a) { return op(a); } TRANSCENDENTALS(log) TRANSCENDENTALS(exp) diff --git a/ospray/common/Model.ispc b/ospray/common/Model.ispc index e3124f291c..9224e6c130 100644 --- a/ospray/common/Model.ispc +++ b/ospray/common/Model.ispc @@ -37,7 +37,7 @@ export void Model_init(void *uniform _model, uniform int32 numGeometries, unifor rtcDeleteScene(model->embreeSceneHandle); model->embreeSceneHandle = rtcNewScene(//RTC_SCENE_STATIC|RTC_SCENE_HIGH_QUALITY, - RTC_SCENE_STATIC,//|RTC_SCENE_COMPACT, + RTC_SCENE_STATIC,//|RTC_SCENE_COMPACT, //RTC_SCENE_DYNAMIC, 
//RTC_SCENE_DYNAMIC|RTC_SCENE_COMPACT, RTC_INTERSECT_UNIFORM|RTC_INTERSECT_VARYING); diff --git a/ospray/fb/FrameBuffer.cpp b/ospray/fb/FrameBuffer.cpp index 95039a60a2..93e127db37 100644 --- a/ospray/fb/FrameBuffer.cpp +++ b/ospray/fb/FrameBuffer.cpp @@ -64,7 +64,7 @@ namespace ospray { colorBuffer = new uint32[size.x*size.y]; break; default: - throw std::runtime_error("color buffer format not supported"); + throw std::runtime_error("color buffer format not supported"); } } @@ -97,7 +97,7 @@ namespace ospray { delete[] ((uint32*)colorBuffer); break; default: - throw std::runtime_error("color buffer format not supported"); + throw std::runtime_error("color buffer format not supported"); } if (accumBuffer) delete[] accumBuffer; } diff --git a/ospray/mpi/MPILoadBalancer.cpp b/ospray/mpi/MPILoadBalancer.cpp index 4c0e4b89b4..d264c20ffb 100644 --- a/ospray/mpi/MPILoadBalancer.cpp +++ b/ospray/mpi/MPILoadBalancer.cpp @@ -25,32 +25,33 @@ namespace ospray { using std::endl; namespace staticLoadBalancer { + Master::Master() { } + void Master::renderFrame(Renderer *tiledRenderer, FrameBuffer *fb, const uint32 channelFlags) { - int rc; MPI_Status status; + int rc; + MPI_Status status; // mpidevice already sent the 'cmd_render_frame' event; we // only have to wait for tiles... - const size_t numTiles = divRoundUp(fb->size.x,TILE_SIZE) * divRoundUp(fb->size.y,TILE_SIZE); - // printf("MASTER: num tiles %li\n",numTiles); assert(fb->colorBufferFormat == OSP_RGBA_I8); uint32 rgba_i8[TILE_SIZE][TILE_SIZE]; for (int i=0;i 2) - cout << "#w: new geometry " << handle << " " << geometry->toString() << endl; + cout << "#w: new geometry " << handle << " " << geometry->toString() << endl; } break; case api::MPIDevice::CMD_FRAMEBUFFER_CREATE: { diff --git a/ospray/render/Renderer.ispc b/ospray/render/Renderer.ispc index 20625ae166..792e9657b8 100644 --- a/ospray/render/Renderer.ispc +++ b/ospray/render/Renderer.ispc @@ -32,7 +32,7 @@ void Renderer_default_renderSample(uniform Renderer *uniform renderer, } void Renderer_default_beginFrame(uniform Renderer *uniform renderer, - uniform FrameBuffer *uniform fb) + uniform FrameBuffer *uniform fb) { renderer->fb = fb; if (renderer->camera == NULL) @@ -79,14 +79,9 @@ void Renderer_default_renderTile(uniform Renderer *uniform renderer, (screenSample.sampleID.y >= fb->size.y)) continue; - vec3f col = make_vec3f(0); -#if 0 - const uint32 zorderIdx = z_order.xyIdx[index]; - const uint32 pixel = getZOrderX(zorderIdx) + (getZOrderY(zorderIdx) * TILE_SIZE); -#else + vec3f col = make_vec3f(0.f); const uint32 pixel = z_order.xs[index] + (z_order.ys[index] * TILE_SIZE); -#endif - for (uniform uint32 s = 0; sspp; s++) { + for (uniform uint32 s = 0; s= fb->size.x) | (screenSample.sampleID.y >= fb->size.y)) { continue; @@ -132,7 +121,7 @@ void Renderer_default_renderTile(uniform Renderer *uniform renderer, const uint32 pixel = z_order.xs[index] + (z_order.ys[index] * TILE_SIZE); assert(pixel < TILE_SIZE*TILE_SIZE); - for (uniform uint32 s = 0; sspp; s++) { + for (uniform uint32 s = 0; srcpSize.x; cameraSample.screen.y = (screenSample.sampleID.y + pixel_dv) * fb->rcpSize.y; @@ -195,11 +184,11 @@ export void Renderer_setNearClip(void *uniform _renderer, uniform float nearClip } export void Renderer_unproject( void *uniform _renderer, - const uniform vec2f &screenPos, - uniform bool &hit, - uniform float &x, - uniform float &y, - uniform float &z) + const uniform vec2f &screenPos, + uniform bool &hit, + uniform float &x, + uniform float &y, + uniform float &z) { uniform Renderer *uniform 
renderer = (uniform Renderer *uniform)_renderer; uniform Camera *uniform camera = renderer->camera;