Skip to content

Commit

Permalink
Optimize pattern search for intel CPUs:
Browse files Browse the repository at this point in the history
- Use AES-NI instructions if available for encryption
- Use AVX2 instructions if available for faster SHA512.
  • Loading branch information
Optiminer committed Aug 6, 2016
1 parent 8f6835b commit d31e02e
Show file tree
Hide file tree
Showing 6 changed files with 551 additions and 11 deletions.
6 changes: 6 additions & 0 deletions src/Makefile.am
Expand Up @@ -71,6 +71,7 @@ endif
# hodlcoin core #
BITCOIN_CORE_H = \
addrman.h \
aes.h \
alert.h \
amount.h \
arith_uint256.h \
Expand Down Expand Up @@ -129,6 +130,7 @@ BITCOIN_CORE_H = \
script/sigcache.h \
script/sign.h \
script/standard.h \
sha512.h \
serialize.h \
streams.h \
support/allocators/secure.h \
Expand Down Expand Up @@ -177,6 +179,7 @@ libbitcoin_util_a-clientversion.$(OBJEXT): obj/build.h
libbitcoin_server_a_CPPFLAGS = $(BITCOIN_INCLUDES) $(MINIUPNPC_CPPFLAGS)
libbitcoin_server_a_SOURCES = \
addrman.cpp \
aes.cpp \
alert.cpp \
bloom.cpp \
chain.cpp \
Expand All @@ -199,6 +202,7 @@ libbitcoin_server_a_SOURCES = \
rpcrawtransaction.cpp \
rpcserver.cpp \
script/sigcache.cpp \
sha512.cpp \
timedata.cpp \
txdb.cpp \
txmempool.cpp \
Expand Down Expand Up @@ -247,6 +251,7 @@ univalue_libbitcoin_univalue_a_SOURCES = \
# common: shared between hodlcoind, and hodlcoin-qt and non-server tools
libbitcoin_common_a_CPPFLAGS = $(BITCOIN_INCLUDES)
libbitcoin_common_a_SOURCES = \
aes.cpp \
amount.cpp \
arith_uint256.cpp \
base58.cpp \
Expand All @@ -272,6 +277,7 @@ libbitcoin_common_a_SOURCES = \
script/script_error.cpp \
script/sign.cpp \
script/standard.cpp \
sha512.cpp \
$(BITCOIN_CORE_H)

# util: shared between all executables.
Expand Down
156 changes: 156 additions & 0 deletions src/aes.cpp
@@ -0,0 +1,156 @@
#include "aes.h"

#include <stdint.h>
#include <x86intrin.h>

#pragma GCC target("aes")

// First half of one AES-256 key-schedule step: derives the next
// even-indexed round key in place.
//   tmp1 - in:  the previous even-indexed round key
//          out: the new even-indexed round key
//   tmp2 - in:  _mm_aeskeygenassist_si128() result for the previous
//               odd-indexed round key
//          out: that result's top 32-bit word replicated into all four words
static inline void ExpandAESKey256_sub1(__m128i *tmp1, __m128i *tmp2)
{
    // Selector 0xFF picks dword 3 for every destination dword.
    *tmp2 = _mm_shuffle_epi32(*tmp2, 0xFF);

    // Fold the key with copies of itself shifted left by 4, 8 and 12 bytes,
    // so every dword becomes the XOR of itself and all lower dwords.
    __m128i key = *tmp1;
    __m128i shifted = key;
    for (int step = 0; step < 3; ++step) {
        shifted = _mm_slli_si128(shifted, 0x04);
        key = _mm_xor_si128(key, shifted);
    }

    *tmp1 = _mm_xor_si128(key, *tmp2);
}

// Second half of one AES-256 key-schedule step: derives the next
// odd-indexed round key in place.
//   tmp1 - the even-indexed round key just produced (read only)
//   tmp3 - in:  the previous odd-indexed round key
//          out: the new odd-indexed round key
static inline void ExpandAESKey256_sub2(__m128i *tmp1, __m128i *tmp3)
{
    // Keygen-assist with rcon 0, then selector 0xAA replicates dword 2 of the
    // assist result (per Intel's definition: the S-box substitution of the
    // source's top dword, without rotation) into all four dwords.
    const __m128i subWord =
        _mm_shuffle_epi32(_mm_aeskeygenassist_si128(*tmp1, 0x00), 0xAA);

    // Fold the key with copies of itself shifted left by 4, 8 and 12 bytes.
    __m128i key = *tmp3;
    __m128i shifted = key;
    for (int step = 0; step < 3; ++step) {
        shifted = _mm_slli_si128(shifted, 0x04);
        key = _mm_xor_si128(key, shifted);
    }

    *tmp3 = _mm_xor_si128(key, subWord);
}

// Special thanks to Intel for helping me
// with ExpandAESKey256() and its subroutines
//
// Expands a 256-bit AES key into the 15 round keys used for encryption.
//   keys   - receives 15 round keys, keys[0] .. keys[14]
//   KeyBuf - the raw key as two 128-bit halves, KeyBuf[0] and KeyBuf[1]
static void ExpandAESKey256(__m128i *keys, const __m128i *KeyBuf)
{
    __m128i evenKey, oddKey, assist;

    // The first two round keys are the raw key halves themselves.
    evenKey = KeyBuf[0];
    keys[0] = evenKey;
    oddKey = KeyBuf[1];
    keys[1] = oddKey;

    // One schedule step yields an even/odd pair of round keys. The round
    // constant has to be a compile-time immediate for the keygen-assist
    // intrinsic, so the step is a local macro rather than a runtime loop.
#define EXPAND_KEY_PAIR(rcon, slot)                              \
    do {                                                         \
        assist = _mm_aeskeygenassist_si128(oddKey, (rcon));      \
        ExpandAESKey256_sub1(&evenKey, &assist);                 \
        keys[(slot)] = evenKey;                                  \
        ExpandAESKey256_sub2(&evenKey, &oddKey);                 \
        keys[(slot) + 1] = oddKey;                               \
    } while (0)

    EXPAND_KEY_PAIR(0x01, 2);
    EXPAND_KEY_PAIR(0x02, 4);
    EXPAND_KEY_PAIR(0x04, 6);
    EXPAND_KEY_PAIR(0x08, 8);
    EXPAND_KEY_PAIR(0x10, 10);
    EXPAND_KEY_PAIR(0x20, 12);

#undef EXPAND_KEY_PAIR

    // The last step only needs the even half: keys[14] is the final round key.
    assist = _mm_aeskeygenassist_si128(oddKey, 0x40);
    ExpandAESKey256_sub1(&evenKey, &assist);
    keys[14] = evenKey;
}

// AES256Core() sizes its per-lane loops with AES_PARALLEL_N, but the round
// macro below is unrolled by hand for exactly eight lanes. Fail the build if
// the two ever disagree instead of silently skipping or overrunning lanes.
#if defined(AES_PARALLEL_N) && (AES_PARALLEL_N != 8)
#error "AESENC_N is unrolled for 8 lanes; update it together with AES_PARALLEL_N"
#endif

// One middle AES round (_mm_aesenc_si128) on lane j using round key i.
// Expects `State` and `ExpandedKey` to be in scope at the expansion site;
// lane j's round keys start at ExpandedKey[j * 16]. The expansion is a
// complete statement, so call sites do not add a semicolon.
#define AESENC(i, j) \
    State[(j)] = _mm_aesenc_si128(State[(j)], ExpandedKey[((j) * 16) + (i)]);

// Round i applied to lanes 0..7 back to back, so the eight independent
// AESENC operations are adjacent in the instruction stream.
#define AESENC_N(i) \
    AESENC(i, 0)    \
    AESENC(i, 1)    \
    AESENC(i, 2)    \
    AESENC(i, 3)    \
    AESENC(i, 4)    \
    AESENC(i, 5)    \
    AESENC(i, 6)    \
    AESENC(i, 7)


static inline void AES256Core(__m128i* State, const __m128i* ExpandedKey)
{
const uint32_t N = AES_PARALLEL_N;

for(unsigned int j=0; j<N; ++j)
State[j] = _mm_xor_si128(State[j], ExpandedKey[j*16+0]);

AESENC_N(1)
AESENC_N(2)
AESENC_N(3)
AESENC_N(4)
AESENC_N(5)
AESENC_N(6)
AESENC_N(7)
AESENC_N(8)
AESENC_N(9)
AESENC_N(10)
AESENC_N(11)
AESENC_N(12)
AESENC_N(13)

for(unsigned int j=0; j<N; ++j)
State[j] = _mm_aesenclast_si128(State[j], ExpandedKey[j*16+14]);
}

static void AES256CBC(__m128i** data, const __m128i** next, const __m128i* ExpandedKey, __m128i* IV)
{
const uint32_t N = AES_PARALLEL_N;
__m128i State[N];
for(unsigned int j=0; j<N; ++j) {
State[j] = _mm_xor_si128( _mm_xor_si128(data[j][0], next[j][0]), IV[j]);
}

AES256Core(State, ExpandedKey);
for(unsigned int j=0; j<N; ++j)
data[j][0] = State[j];

for(int i = 1; i < BLOCK_COUNT; ++i)
{
for(unsigned int j=0; j<N; ++j) {
State[j] = _mm_xor_si128( _mm_xor_si128(data[j][i], next[j][i]), data[j][i - 1]);
}
AES256Core(State, ExpandedKey);
for(unsigned int j=0; j<N; ++j) {
data[j][i] = State[j];
}
}
}

// Public entry point for the AES-256 key expansion, exposed with plain
// integer pointers so callers do not need the intrinsic vector type.
//   keys   - receives 15 round keys of 16 bytes each (60 uint32_t)
//   KeyBuf - the 32-byte raw key (8 uint32_t)
// Both buffers are reinterpreted as __m128i arrays, so they are expected to
// be 16-byte aligned.
void ExpandAESKey256_int(uint32_t *keys, const uint32_t *KeyBuf) {
    __m128i *roundKeys = (__m128i*)keys;
    const __m128i *rawKey = (const __m128i*)KeyBuf;

    ExpandAESKey256(roundKeys, rawKey);
}

// Public entry point for the multi-lane chained AES-256 encryption, exposed
// with plain integer pointers so callers do not need the intrinsic vector type.
//   data        - one pointer per lane to that lane's blocks (encrypted in place)
//   next        - one pointer per lane to the blocks XORed in before encryption
//   ExpandedKey - per-lane round keys, 16 vector slots (64 uint32_t) per lane
//   IV          - one 16-byte chaining value per lane
// All buffers are reinterpreted as __m128i arrays, so they are expected to be
// 16-byte aligned.
void AES256CBC_int(uint32_t** data, const uint32_t** next, const uint32_t* ExpandedKey, uint32_t* IV) {
    __m128i **laneData = (__m128i**)data;
    const __m128i **laneNext = (const __m128i**)next;
    const __m128i *roundKeys = (const __m128i*)ExpandedKey;
    __m128i *laneIV = (__m128i*)IV;

    AES256CBC(laneData, laneNext, roundKeys, laneIV);
}

13 changes: 13 additions & 0 deletions src/aes.h
@@ -0,0 +1,13 @@
// AES-NI accelerated AES-256 helpers: key expansion and a multi-lane,
// CBC-style chained encryption.
#ifndef AES_H
#define AES_H

#include <stdint.h>
#include <x86intrin.h>

// Number of independent lanes encrypted side by side by AES256CBC_int().
#define AES_PARALLEL_N 8
// Number of 16-byte blocks processed per lane by AES256CBC_int().
#define BLOCK_COUNT 256

// Expands a 256-bit key into 15 round keys.
//   keys   - output, 15 round keys of 16 bytes each (60 uint32_t)
//   KeyBuf - input, the 32-byte raw key (8 uint32_t)
// The implementation accesses both buffers as 128-bit vectors, so they are
// expected to be 16-byte aligned.
void ExpandAESKey256_int(uint32_t *keys, const uint32_t *KeyBuf);

// Encrypts BLOCK_COUNT 16-byte blocks in each of AES_PARALLEL_N lanes, in place.
//   data        - data[j] points at lane j's blocks; overwritten with ciphertext
//   next        - next[j] points at blocks XORed into data[j] before encryption
//   ExpandedKey - round keys for all lanes; lane j's keys start 16 vector
//                 slots (64 uint32_t) after lane j-1's
//   IV          - one 16-byte chaining value per lane for the first block
// The implementation accesses all buffers as 128-bit vectors, so they are
// expected to be 16-byte aligned.
void AES256CBC_int(uint32_t** data, const uint32_t** next, const uint32_t* ExpandedKey, uint32_t* IV);

#endif // AES_H

0 comments on commit d31e02e

Please sign in to comment.