Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
Optimize TransformationMatrix::multiply() for x86_64
https://bugs.webkit.org/show_bug.cgi?id=105719

Reviewed by Sam Weinig.

On x86_64, we have access to 16 XMM registers. This can hold 32 double values.
We can use that in two ways to optimize matrix multiplications:
-Keep the source matrix completely in registers. Write the result directly in
 the source matrix's memory. This avoids the memcpy at the end of the multiplication
 and various memory operations.
-Use SIMD with SSE to perform 2 operations at a time.

The parameters from the second matrix are loaded one by one into XMM registers.
Loading them with SSE and then shuffling the values performs worse than loading
them one by one.

This is only enabled on 64-bit x86, as 32-bit x86 only has access to 8 XMM
registers and the function would need to be written differently.

On an i5, TransformationMatrix::multiply() performs about 3 times faster with the change.

* platform/graphics/transforms/TransformationMatrix.cpp:
(WebCore::TransformationMatrix::multiply):
* platform/graphics/transforms/TransformationMatrix.h:
(TransformationMatrix): Fix an incorrect comment. Unify the comment with the cpp file.


Canonical link: https://commits.webkit.org/124339@main
git-svn-id: https://svn.webkit.org/repository/webkit/trunk@138866 268f45cc-cd09-0410-ab3c-d52691b4dbfc
  • Loading branch information
BenjaminPoulain committed Jan 5, 2013
1 parent 9b88b89 commit b79026f
Show file tree
Hide file tree
Showing 3 changed files with 162 additions and 5 deletions.
28 changes: 28 additions & 0 deletions Source/WebCore/ChangeLog
@@ -1,3 +1,31 @@
2013-01-04 Benjamin Poulain <benjamin@webkit.org>

Optimize TransformationMatrix::multiply() for x86_64
https://bugs.webkit.org/show_bug.cgi?id=105719

Reviewed by Sam Weinig.

On x86_64, we have access to 16 XMM registers. This can hold 32 double values.
We can use that in two ways to optimize matrix multiplications:
-Keep the source matrix completely in registers. Write the result directly in
the source matrix's memory. This avoids the memcpy at the end of the multiplication
and various memory operations.
-Use SIMD with SSE to perform 2 operations at a time.

The parameters from the second matrix are loaded one by one into XMM registers.
Loading them with SSE and then shuffling the values performs worse than loading
them one by one.

This is only enabled on 64-bit x86, as 32-bit x86 only has access to 8 XMM
registers and the function would need to be written differently.

On an i5, TransformationMatrix::multiply() performs about 3 times faster with the change.

* platform/graphics/transforms/TransformationMatrix.cpp:
(WebCore::TransformationMatrix::multiply):
* platform/graphics/transforms/TransformationMatrix.h:
(TransformationMatrix): Fix an incorrect comment. Unify the comment with the cpp file.

2013-01-04 Joshua Bell <jsbell@chromium.org>

Unreviewed, fix Apple Win build following http://trac.webkit.org/changeset/138838
Expand Down
131 changes: 128 additions & 3 deletions Source/WebCore/platform/graphics/transforms/TransformationMatrix.cpp
Expand Up @@ -36,6 +36,10 @@
#include <wtf/Assertions.h>
#include <wtf/MathExtras.h>

#if CPU(X86_64)
#include <emmintrin.h>
#endif

using namespace std;

namespace WebCore {
Expand Down Expand Up @@ -968,9 +972,7 @@ TransformationMatrix TransformationMatrix::rectToRect(const FloatRect& from, con
to.y() - from.y());
}

//
// *this = mat * *this
//
// this = mat * this.
TransformationMatrix& TransformationMatrix::multiply(const TransformationMatrix& mat)
{
#if CPU(APPLE_ARMV7S)
Expand Down Expand Up @@ -1115,6 +1117,129 @@ TransformationMatrix& TransformationMatrix::multiply(const TransformationMatrix&
}
#undef MATRIX_MULTIPLY_ONE_LINE

#elif defined(TRANSFORMATION_MATRIX_USE_X86_64_SSE2)
// x86_64 has 16 XMM registers which is enough to do the multiplication fully in registers.
//
// Computes this = mat * this (see the comment on multiply() above), producing
// the result two doubles at a time and writing each pair straight back into
// m_matrix, so no temporary 4x4 matrix or final memcpy is needed.
//
// Block naming: each matrixBlock register holds one aligned pair of doubles
// from the source matrix (*this):
//     A = m_matrix[0][0..1]    B = m_matrix[0][2..3]
//     C = m_matrix[1][0..1]    D = m_matrix[1][2..3]
//     E = m_matrix[2][0..1]    F = m_matrix[2][2..3]
//     G = m_matrix[3][0..1]    H = m_matrix[3][2..3]
//
// _mm_load_pd/_mm_store_pd require 16-byte alignment; Matrix4 is declared
// __attribute__((aligned(16))) in the header when this code path is enabled.
__m128d matrixBlockA = _mm_load_pd(&(m_matrix[0][0]));
__m128d matrixBlockC = _mm_load_pd(&(m_matrix[1][0]));
__m128d matrixBlockE = _mm_load_pd(&(m_matrix[2][0]));
__m128d matrixBlockG = _mm_load_pd(&(m_matrix[3][0]));

// First row.
// Each scalar of mat is broadcast into both lanes of an XMM register with
// _mm_set1_pd; per the change description, loading the scalars one by one
// beats an SSE load followed by shuffles here.
__m128d otherMatrixFirstParam = _mm_set1_pd(mat.m_matrix[0][0]);
__m128d otherMatrixSecondParam = _mm_set1_pd(mat.m_matrix[0][1]);
__m128d otherMatrixThirdParam = _mm_set1_pd(mat.m_matrix[0][2]);
__m128d otherMatrixFourthParam = _mm_set1_pd(mat.m_matrix[0][3]);

// output00 and output01.
__m128d accumulator = _mm_mul_pd(matrixBlockA, otherMatrixFirstParam);
__m128d temp1 = _mm_mul_pd(matrixBlockC, otherMatrixSecondParam);
__m128d temp2 = _mm_mul_pd(matrixBlockE, otherMatrixThirdParam);
__m128d temp3 = _mm_mul_pd(matrixBlockG, otherMatrixFourthParam);

// Load the second half of the source matrix (columns 2..3) now, before the
// first store below starts overwriting m_matrix in place. After this point
// the whole source matrix lives in registers A..H.
__m128d matrixBlockB = _mm_load_pd(&(m_matrix[0][2]));
__m128d matrixBlockD = _mm_load_pd(&(m_matrix[1][2]));
__m128d matrixBlockF = _mm_load_pd(&(m_matrix[2][2]));
__m128d matrixBlockH = _mm_load_pd(&(m_matrix[3][2]));

accumulator = _mm_add_pd(accumulator, temp1);
accumulator = _mm_add_pd(accumulator, temp2);
accumulator = _mm_add_pd(accumulator, temp3);
// Safe to write the result in place: every source pair is already in registers.
_mm_store_pd(&m_matrix[0][0], accumulator);

// output02 and output03.
accumulator = _mm_mul_pd(matrixBlockB, otherMatrixFirstParam);
temp1 = _mm_mul_pd(matrixBlockD, otherMatrixSecondParam);
temp2 = _mm_mul_pd(matrixBlockF, otherMatrixThirdParam);
temp3 = _mm_mul_pd(matrixBlockH, otherMatrixFourthParam);

accumulator = _mm_add_pd(accumulator, temp1);
accumulator = _mm_add_pd(accumulator, temp2);
accumulator = _mm_add_pd(accumulator, temp3);
_mm_store_pd(&m_matrix[0][2], accumulator);

// Second row.
otherMatrixFirstParam = _mm_set1_pd(mat.m_matrix[1][0]);
otherMatrixSecondParam = _mm_set1_pd(mat.m_matrix[1][1]);
otherMatrixThirdParam = _mm_set1_pd(mat.m_matrix[1][2]);
otherMatrixFourthParam = _mm_set1_pd(mat.m_matrix[1][3]);

// output10 and output11.
accumulator = _mm_mul_pd(matrixBlockA, otherMatrixFirstParam);
temp1 = _mm_mul_pd(matrixBlockC, otherMatrixSecondParam);
temp2 = _mm_mul_pd(matrixBlockE, otherMatrixThirdParam);
temp3 = _mm_mul_pd(matrixBlockG, otherMatrixFourthParam);

accumulator = _mm_add_pd(accumulator, temp1);
accumulator = _mm_add_pd(accumulator, temp2);
accumulator = _mm_add_pd(accumulator, temp3);
_mm_store_pd(&m_matrix[1][0], accumulator);

// output12 and output13.
accumulator = _mm_mul_pd(matrixBlockB, otherMatrixFirstParam);
temp1 = _mm_mul_pd(matrixBlockD, otherMatrixSecondParam);
temp2 = _mm_mul_pd(matrixBlockF, otherMatrixThirdParam);
temp3 = _mm_mul_pd(matrixBlockH, otherMatrixFourthParam);

accumulator = _mm_add_pd(accumulator, temp1);
accumulator = _mm_add_pd(accumulator, temp2);
accumulator = _mm_add_pd(accumulator, temp3);
_mm_store_pd(&m_matrix[1][2], accumulator);

// Third row.
otherMatrixFirstParam = _mm_set1_pd(mat.m_matrix[2][0]);
otherMatrixSecondParam = _mm_set1_pd(mat.m_matrix[2][1]);
otherMatrixThirdParam = _mm_set1_pd(mat.m_matrix[2][2]);
otherMatrixFourthParam = _mm_set1_pd(mat.m_matrix[2][3]);

// output20 and output21.
accumulator = _mm_mul_pd(matrixBlockA, otherMatrixFirstParam);
temp1 = _mm_mul_pd(matrixBlockC, otherMatrixSecondParam);
temp2 = _mm_mul_pd(matrixBlockE, otherMatrixThirdParam);
temp3 = _mm_mul_pd(matrixBlockG, otherMatrixFourthParam);

accumulator = _mm_add_pd(accumulator, temp1);
accumulator = _mm_add_pd(accumulator, temp2);
accumulator = _mm_add_pd(accumulator, temp3);
_mm_store_pd(&m_matrix[2][0], accumulator);

// output22 and output23.
accumulator = _mm_mul_pd(matrixBlockB, otherMatrixFirstParam);
temp1 = _mm_mul_pd(matrixBlockD, otherMatrixSecondParam);
temp2 = _mm_mul_pd(matrixBlockF, otherMatrixThirdParam);
temp3 = _mm_mul_pd(matrixBlockH, otherMatrixFourthParam);

accumulator = _mm_add_pd(accumulator, temp1);
accumulator = _mm_add_pd(accumulator, temp2);
accumulator = _mm_add_pd(accumulator, temp3);
_mm_store_pd(&m_matrix[2][2], accumulator);

// Fourth row.
otherMatrixFirstParam = _mm_set1_pd(mat.m_matrix[3][0]);
otherMatrixSecondParam = _mm_set1_pd(mat.m_matrix[3][1]);
otherMatrixThirdParam = _mm_set1_pd(mat.m_matrix[3][2]);
otherMatrixFourthParam = _mm_set1_pd(mat.m_matrix[3][3]);

// output30 and output31.
accumulator = _mm_mul_pd(matrixBlockA, otherMatrixFirstParam);
temp1 = _mm_mul_pd(matrixBlockC, otherMatrixSecondParam);
temp2 = _mm_mul_pd(matrixBlockE, otherMatrixThirdParam);
temp3 = _mm_mul_pd(matrixBlockG, otherMatrixFourthParam);

accumulator = _mm_add_pd(accumulator, temp1);
accumulator = _mm_add_pd(accumulator, temp2);
accumulator = _mm_add_pd(accumulator, temp3);
_mm_store_pd(&m_matrix[3][0], accumulator);

// output32 and output33.
accumulator = _mm_mul_pd(matrixBlockB, otherMatrixFirstParam);
temp1 = _mm_mul_pd(matrixBlockD, otherMatrixSecondParam);
temp2 = _mm_mul_pd(matrixBlockF, otherMatrixThirdParam);
temp3 = _mm_mul_pd(matrixBlockH, otherMatrixFourthParam);

accumulator = _mm_add_pd(accumulator, temp1);
accumulator = _mm_add_pd(accumulator, temp2);
accumulator = _mm_add_pd(accumulator, temp3);
_mm_store_pd(&m_matrix[3][2], accumulator);
#else
Matrix4 tmp;

Expand Down
Expand Up @@ -69,10 +69,14 @@ class LayoutRect;
class FloatRect;
class FloatQuad;

#if CPU(X86_64) && !PLATFORM(WINDOWS)
#define TRANSFORMATION_MATRIX_USE_X86_64_SSE2
#endif

class TransformationMatrix {
WTF_MAKE_FAST_ALLOCATED;
public:
#if CPU(APPLE_ARMV7S)
#if CPU(APPLE_ARMV7S) || defined(TRANSFORMATION_MATRIX_USE_X86_64_SSE2)
typedef double Matrix4[4][4] __attribute__((aligned (16)));
#else
typedef double Matrix4[4][4];
Expand Down Expand Up @@ -226,7 +230,7 @@ class TransformationMatrix {
double f() const { return m_matrix[3][1]; }
void setF(double f) { m_matrix[3][1] = f; }

// this = this * mat
// this = mat * this.
TransformationMatrix& multiply(const TransformationMatrix&);

TransformationMatrix& scale(double);
Expand Down

0 comments on commit b79026f

Please sign in to comment.