Permalink
Browse files

OpenGL Renderer: Use memcmp() to ensure that UploadClearImage() uploads clear image data only when necessary.

Significantly improves the performance of games that use clear images.
  • Loading branch information...
rogerman committed Aug 16, 2018
1 parent 208f99f commit c9a405e959065dacaabae5f4b5462e46ddc70f2d
Showing with 45 additions and 19 deletions.
  1. +36 −13 desmume/src/OGLRender.cpp
  2. +7 −5 desmume/src/OGLRender.h
  3. +2 −1 desmume/src/OGLRender_3_2.cpp
View
@@ -1066,6 +1066,7 @@ OpenGLRenderer::OpenGLRenderer()
_needsZeroDstAlphaPass = true;
_currentPolyIndex = 0;
_lastTextureDrawTarget = OGLTextureUnitID_GColor;
_clearImageIndex = 0;
}
OpenGLRenderer::~OpenGLRenderer()
@@ -2703,12 +2704,13 @@ Render3DError OpenGLRenderer_1_2::CreateFBOs()
glTexImage2D(GL_TEXTURE_2D, 0, GL_DEPTH24_STENCIL8_EXT, this->_framebufferWidth, this->_framebufferHeight, 0, GL_DEPTH_STENCIL_EXT, GL_UNSIGNED_INT_24_8_EXT, NULL);
}
memset(OGLRef.workingCIColorBuffer, 0, sizeof(OGLRef.workingCIColorBuffer));
glBindTexture(GL_TEXTURE_2D, OGLRef.texCIColorID);
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE);
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE);
glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, GPU_FRAMEBUFFER_NATIVE_WIDTH, GPU_FRAMEBUFFER_NATIVE_HEIGHT, 0, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, NULL);
glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, GPU_FRAMEBUFFER_NATIVE_WIDTH, GPU_FRAMEBUFFER_NATIVE_HEIGHT, 0, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, OGLRef.workingCIColorBuffer);
glBindTexture(GL_TEXTURE_2D, OGLRef.texCIDepthStencilID);
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
@@ -3409,38 +3411,58 @@ Render3DError OpenGLRenderer_1_2::DestroyToonTable()
Render3DError OpenGLRenderer_1_2::UploadClearImage(const u16 *__restrict colorBuffer, const u32 *__restrict depthBuffer, const u8 *__restrict fogBuffer, const u8 *__restrict polyIDBuffer)
{
OGLRenderRef &OGLRef = *this->ref;
this->_clearImageIndex ^= 0x01;
if (this->isShaderSupported)
{
for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT; i++)
{
OGLRef.workingCIDepthStencilBuffer[i] = (depthBuffer[i] << 8) | polyIDBuffer[i];
OGLRef.workingCIFogAttributesBuffer[i] = (fogBuffer[i]) ? 0xFF0000FF : 0xFF000000;
OGLRef.workingCIPolyIDBuffer[i] = (GLuint)polyIDBuffer[i] | 0xFF000000;
OGLRef.workingCIDepthStencilBuffer[this->_clearImageIndex][i] = (depthBuffer[i] << 8) | polyIDBuffer[i];
OGLRef.workingCIFogAttributesBuffer[this->_clearImageIndex][i] = (fogBuffer[i]) ? 0xFF0000FF : 0xFF000000;
OGLRef.workingCIPolyIDBuffer[this->_clearImageIndex][i] = (GLuint)polyIDBuffer[i] | 0xFF000000;
}
}
else
{
for (size_t i = 0; i < GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT; i++)
{
OGLRef.workingCIDepthStencilBuffer[i] = (depthBuffer[i] << 8) | polyIDBuffer[i];
OGLRef.workingCIDepthStencilBuffer[this->_clearImageIndex][i] = (depthBuffer[i] << 8) | polyIDBuffer[i];
}
}
const bool didColorChange = (memcmp(OGLRef.workingCIColorBuffer, colorBuffer, GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT * sizeof(u16)) != 0);
const bool didDepthStencilChange = (memcmp(OGLRef.workingCIDepthStencilBuffer[this->_clearImageIndex], OGLRef.workingCIDepthStencilBuffer[this->_clearImageIndex ^ 0x01], GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT * sizeof(GLuint)) != 0);
const bool didFogAttributesChange = (memcmp(OGLRef.workingCIFogAttributesBuffer[this->_clearImageIndex], OGLRef.workingCIFogAttributesBuffer[this->_clearImageIndex ^ 0x01], GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT * sizeof(GLuint)) != 0);
const bool didPolyIDChange = (memcmp(OGLRef.workingCIPolyIDBuffer[this->_clearImageIndex], OGLRef.workingCIPolyIDBuffer[this->_clearImageIndex ^ 0x01], GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT * sizeof(GLuint)) != 0);
glActiveTextureARB(GL_TEXTURE0_ARB);
glBindTexture(GL_TEXTURE_2D, OGLRef.texCIColorID);
glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, GPU_FRAMEBUFFER_NATIVE_WIDTH, GPU_FRAMEBUFFER_NATIVE_HEIGHT, GL_RGBA, GL_UNSIGNED_SHORT_1_5_5_5_REV, colorBuffer);
glBindTexture(GL_TEXTURE_2D, OGLRef.texCIDepthStencilID);
glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, GPU_FRAMEBUFFER_NATIVE_WIDTH, GPU_FRAMEBUFFER_NATIVE_HEIGHT, GL_DEPTH_STENCIL_EXT, GL_UNSIGNED_INT_24_8_EXT, OGLRef.workingCIDepthStencilBuffer);
if (didColorChange)
{
memcpy(OGLRef.workingCIColorBuffer, colorBuffer, GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT * sizeof(u16));
glBindTexture(GL_TEXTURE_2D, OGLRef.texCIColorID);
glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, GPU_FRAMEBUFFER_NATIVE_WIDTH, GPU_FRAMEBUFFER_NATIVE_HEIGHT, GL_RGBA, GL_UNSIGNED_SHORT_1_5_5_5_REV, OGLRef.workingCIColorBuffer);
}
if (didDepthStencilChange)
{
glBindTexture(GL_TEXTURE_2D, OGLRef.texCIDepthStencilID);
glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, GPU_FRAMEBUFFER_NATIVE_WIDTH, GPU_FRAMEBUFFER_NATIVE_HEIGHT, GL_DEPTH_STENCIL_EXT, GL_UNSIGNED_INT_24_8_EXT, OGLRef.workingCIDepthStencilBuffer[this->_clearImageIndex]);
}
if (this->isShaderSupported)
{
glBindTexture(GL_TEXTURE_2D, OGLRef.texCIFogAttrID);
glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, GPU_FRAMEBUFFER_NATIVE_WIDTH, GPU_FRAMEBUFFER_NATIVE_HEIGHT, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, OGLRef.workingCIFogAttributesBuffer);
if (didFogAttributesChange)
{
glBindTexture(GL_TEXTURE_2D, OGLRef.texCIFogAttrID);
glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, GPU_FRAMEBUFFER_NATIVE_WIDTH, GPU_FRAMEBUFFER_NATIVE_HEIGHT, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, OGLRef.workingCIFogAttributesBuffer[this->_clearImageIndex]);
}
glBindTexture(GL_TEXTURE_2D, OGLRef.texCIPolyID);
glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, GPU_FRAMEBUFFER_NATIVE_WIDTH, GPU_FRAMEBUFFER_NATIVE_HEIGHT, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, OGLRef.workingCIPolyIDBuffer);
if (didPolyIDChange)
{
glBindTexture(GL_TEXTURE_2D, OGLRef.texCIPolyID);
glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, GPU_FRAMEBUFFER_NATIVE_WIDTH, GPU_FRAMEBUFFER_NATIVE_HEIGHT, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, OGLRef.workingCIPolyIDBuffer[this->_clearImageIndex]);
}
}
glBindTexture(GL_TEXTURE_2D, 0);
@@ -4742,6 +4764,7 @@ Render3DError OpenGLRenderer_1_2::Reset()
OGLRef.vtxPtrTexCoord = (GLvoid *)offsetof(VERT, texcoord);
OGLRef.vtxPtrColor = (this->isShaderSupported) ? (GLvoid *)offsetof(VERT, color) : OGLRef.color4fBuffer;
memset(OGLRef.workingCIColorBuffer, 0, sizeof(OGLRef.workingCIColorBuffer));
memset(this->clearImageColor16Buffer, 0, sizeof(this->clearImageColor16Buffer));
memset(this->clearImageDepthBuffer, 0, sizeof(this->clearImageDepthBuffer));
memset(this->clearImagePolyIDBuffer, 0, sizeof(this->clearImagePolyIDBuffer));
View
@@ -534,10 +534,11 @@ struct OGLRenderRef
// Client-side Buffers
GLfloat *color4fBuffer;
GLushort *vertIndexBuffer;
CACHE_ALIGN GLuint workingCIDepthStencilBuffer[GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT];
CACHE_ALIGN GLuint workingCIFogAttributesBuffer[GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT];
CACHE_ALIGN GLuint workingCIPolyIDBuffer[GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT];
GLushort *vertIndexBuffer;
CACHE_ALIGN GLushort workingCIColorBuffer[GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT];
CACHE_ALIGN GLuint workingCIDepthStencilBuffer[2][GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT];
CACHE_ALIGN GLuint workingCIFogAttributesBuffer[2][GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT];
CACHE_ALIGN GLuint workingCIPolyIDBuffer[2][GPU_FRAMEBUFFER_NATIVE_WIDTH * GPU_FRAMEBUFFER_NATIVE_HEIGHT];
// Vertex Attributes Pointers
GLvoid *vtxPtrPosition;
@@ -661,7 +662,8 @@ class OpenGLRenderer : public Render3D
size_t _currentPolyIndex;
OGLTextureUnitID _lastTextureDrawTarget;
bool _enableMultisampledRendering;
bool _enableMultisampledRendering;
size_t _clearImageIndex;
Render3DError FlushFramebuffer(const FragmentColor *__restrict srcFramebuffer, FragmentColor *__restrict dstFramebufferMain, u16 *__restrict dstFramebuffer16);
OpenGLTexture* GetLoadedTextureFromPolygon(const POLY &thePoly, bool enableTexturing);
@@ -879,12 +879,13 @@ Render3DError OpenGLRenderer_3_2::CreateFBOs()
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_COMPARE_MODE, GL_NONE);
glTexImage2D(GL_TEXTURE_2D, 0, GL_DEPTH24_STENCIL8, this->_framebufferWidth, this->_framebufferHeight, 0, GL_DEPTH_STENCIL, GL_UNSIGNED_INT_24_8, NULL);
memset(OGLRef.workingCIColorBuffer, 0, sizeof(OGLRef.workingCIColorBuffer));
glBindTexture(GL_TEXTURE_2D, OGLRef.texCIColorID);
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE);
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE);
glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, GPU_FRAMEBUFFER_NATIVE_WIDTH, GPU_FRAMEBUFFER_NATIVE_HEIGHT, 0, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, NULL);
glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, GPU_FRAMEBUFFER_NATIVE_WIDTH, GPU_FRAMEBUFFER_NATIVE_HEIGHT, 0, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, OGLRef.workingCIColorBuffer);
glBindTexture(GL_TEXTURE_2D, OGLRef.texCIDepthStencilID);
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);

4 comments on commit c9a405e

@Jules-A

This comment has been minimized.

Show comment
Hide comment
@Jules-A

Jules-A Aug 16, 2018

Is there a way to tell which games benefit from this, or do you have any examples? I've tested Pokemon HeartGold in the AppVeyor builds, which seemed to result in a ~1.5% performance increase, but I can't replicate it on the Nightly builds: due to the upgrade in MSVC compiler from 1910 to 1914, I'm seeing 3% lower FPS (I'm using an old FX8320, so I guess it's expected as the focus moves to newer CPUs).

Jules-A replied Aug 16, 2018

Is there a way to tell which games benefit from this, or do you have any examples? I've tested Pokemon HeartGold in the AppVeyor builds, which seemed to result in a ~1.5% performance increase, but I can't replicate it on the Nightly builds: due to the upgrade in MSVC compiler from 1910 to 1914, I'm seeing 3% lower FPS (I'm using an old FX8320, so I guess it's expected as the focus moves to newer CPUs).

@zeromus

This comment has been minimized.

Show comment
Hide comment
@zeromus

zeromus Aug 16, 2018

Contributor

Pokemon doesn't use clear images. This commit shouldn't affect anything that doesn't use clear images. You might need to improve your benchmarking methods. Differences of 1.5% are hard to be sure about, anyway. Search render3d.cpp for ClearFramebuffer().

Contributor

zeromus replied Aug 16, 2018

Pokemon doesn't use clear images. This commit shouldn't affect anything that doesn't use clear images. You might need to improve your benchmarking methods. Differences of 1.5% are hard to be sure about, anyway. Search render3d.cpp for ClearFramebuffer().

@rogerman

This comment has been minimized.

Show comment
Hide comment
@rogerman

rogerman Aug 16, 2018

Collaborator

As a matter of fact, most games don't use clear images, but there are a select few that do. Some examples would be:

  • Sonic Chronicles: Dark Brotherhood
  • Harry Potter and the Order of the Phoenix
  • Blazer Drive
Collaborator

rogerman replied Aug 16, 2018

As a matter of fact, most games don't use clear images, but there are a select few that do. Some examples would be:

  • Sonic Chronicles: Dark Brotherhood
  • Harry Potter and the Order of the Phoenix
  • Blazer Drive
@Jules-A

This comment has been minimized.

Show comment
Hide comment
@Jules-A

Jules-A Aug 16, 2018

Yeah, I only did two 60-second Fraps benchmarks in different locations, but with a few apps open. I did suspect it was within the margin of error (MoE), which is partially why I was asking. I just did two more with everything closed, and things are much closer now.

2018-08-17 07:26:28 - DeSmuME-VS2015-x64-Releaseolder
Frames: 4485 - Time: 60000ms - Avg: 74.750 - Min: 69 - Max: 98

2018-08-17 07:28:43 - DeSmuME-VS2015-x64-Releasenewer
Frames: 4513 - Time: 60000ms - Avg: 75.217 - Min: 68 - Max: 99

2018-08-17 07:39:31 - DeSmuME-VS2015-x64-Releasenewer
Frames: 4145 - Time: 60000ms - Avg: 69.083 - Min: 63 - Max: 88

2018-08-17 07:41:19 - DeSmuME-VS2015-x64-Releaseolder
Frames: 4116 - Time: 60000ms - Avg: 68.600 - Min: 62 - Max: 87

Thanks for clearing things up :)

Jules-A replied Aug 16, 2018

Yeah, I only did two 60-second Fraps benchmarks in different locations, but with a few apps open. I did suspect it was within the margin of error (MoE), which is partially why I was asking. I just did two more with everything closed, and things are much closer now.

2018-08-17 07:26:28 - DeSmuME-VS2015-x64-Releaseolder
Frames: 4485 - Time: 60000ms - Avg: 74.750 - Min: 69 - Max: 98

2018-08-17 07:28:43 - DeSmuME-VS2015-x64-Releasenewer
Frames: 4513 - Time: 60000ms - Avg: 75.217 - Min: 68 - Max: 99

2018-08-17 07:39:31 - DeSmuME-VS2015-x64-Releasenewer
Frames: 4145 - Time: 60000ms - Avg: 69.083 - Min: 63 - Max: 88

2018-08-17 07:41:19 - DeSmuME-VS2015-x64-Releaseolder
Frames: 4116 - Time: 60000ms - Avg: 68.600 - Min: 62 - Max: 87

Thanks for clearing things up :)

Please sign in to comment.