Add AVX optimizations for 7-tuple and 8-tuple sieving (#26)
* 7-tuple SIMD sieve
* 8-tuple sieving optimizations
* Add AVX2 version of 7-tuple sieve
MichaelBell committed Apr 9, 2021
1 parent da95132 commit 449646d
Showing 2 changed files with 245 additions and 4 deletions.
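All four new routines share one pattern: a prime's factor offsets are held in SSE/AVX registers, lanes still inside the sieve window are flushed to the sieve cache, and the compare mask ANDed with the prime advances exactly those lanes without branching. A scalar sketch of the per-lane update (illustrative only, not part of the commit; advanceLane is a hypothetical name, and the variables mirror the code below):

// Scalar analogue of the SIMD update in the loops below (sketch, needs <cstdint>).
inline uint32_t advanceLane(uint32_t offset, uint32_t p, uint32_t sieveSize) {
// cmp is all-ones while the offset is inside the window and all-zeros once it
// leaves, matching _mm_cmpgt_epi32(offsetmax, factor).
const uint32_t cmp = (offset < sieveSize) ? 0xffffffffu : 0u;
// (cmp & p) advances only lanes still being sieved: _mm_and_si128 + _mm_add_epi32.
return offset + (cmp & p);
}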
236 changes: 233 additions & 3 deletions Miner.cpp
@@ -155,7 +155,7 @@ void Miner::init(const MinerParameters &minerParameters) {
std::cout << "Table with all " << primes.size() << " first primes generated in " << timeSince(t0) << " s (" << primes.size()*sizeof(decltype(primes)::value_type) << " bytes)." << std::endl;
}

- if (primes.size() % 2 == 1 && _parameters.pattern.size() == 6) // Needs to be even to use optimizations for 6-tuples
+ if (primes.size() % 2 == 1) // Needs to be even to use SIMD sieving optimizations
primes.pop_back();

try {
@@ -241,7 +241,7 @@ void Miner::init(const MinerParameters &minerParameters) {
if (p >= _factorMax) {
if (_primesIndexThreshold == 0) {
_primesIndexThreshold = i;
- if (_primesIndexThreshold % 2 == 1 && _parameters.pattern.size() == 6) // Needs to be even to use optimizations for 6-tuples
+ if (_primesIndexThreshold % 2 == 1) // Needs to be even to use SIMD sieving optimizations
_primesIndexThreshold--;
}
sumInversesOfPrimes += 1./static_cast<double>(p);
@@ -309,7 +309,7 @@ void Miner::init(const MinerParameters &minerParameters) {
try {
std::cout << "Allocating " << sizeof(uint32_t)*_parameters.sieveWorkers*factorsToEliminateEntries << " bytes for the primorial factors..." << std::endl;
for (auto &sieve : _sieves) {
- sieve.factorsToEliminate = new uint32_t[factorsToEliminateEntries];
+ sieve.factorsToEliminate = reinterpret_cast<uint32_t*>(new __m256i[(factorsToEliminateEntries + 7) / 8]);
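// Backing the factor table with new __m256i[] yields a 32-byte-aligned allocation
// (guaranteed since C++17's aligned operator new), which the aligned load/store
// intrinsics in the new 8-tuple paths require.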
memset(sieve.factorsToEliminate, 0, sizeof(uint32_t)*factorsToEliminateEntries);
}
}
@@ -661,6 +661,226 @@ void Miner::_processSieve6(uint64_t *factorsTable, uint32_t* factorsToEliminate,
}
}

void Miner::_processSieve7(uint64_t *factorsTable, uint32_t* factorsToEliminate, uint64_t firstPrimeIndex, const uint64_t lastPrimeIndex) { // Assembly optimized sieving for 7-tuples by Michael Bell
assert(_parameters.pattern.size() == 7);

std::array<uint32_t, sieveCacheSize> sieveCache{0};
uint64_t sieveCachePos(0);

xmmreg_t offsetmax;
offsetmax.m128 = _mm_set1_epi32(_parameters.sieveSize);
for (uint64_t i(firstPrimeIndex) ; i < lastPrimeIndex ; i += 1) {
xmmreg_t p1;
xmmreg_t factor1, factor2, nextIncr1, nextIncr2;
xmmreg_t cmpres1, cmpres2;
p1.m128 = _mm_set1_epi32(_primes32[i]);
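// The seven offsets of prime i are split across two 4-lane registers that
// overlap by one lane: factor1 covers offsets 0-3 and factor2 covers offsets 3-6.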
factor1.m128 = _mm_loadu_si128(reinterpret_cast<__m128i const*>(&factorsToEliminate[i*7 + 0]));
factor2.m128 = _mm_loadu_si128(reinterpret_cast<__m128i const*>(&factorsToEliminate[i*7 + 3]));
while (true) {
cmpres1.m128 = _mm_cmpgt_epi32(offsetmax.m128, factor1.m128);
cmpres2.m128 = _mm_cmpgt_epi32(offsetmax.m128, factor2.m128);
const int mask1(_mm_movemask_epi8(cmpres1.m128));
const int mask2(_mm_movemask_epi8(cmpres2.m128));
if ((mask1 == 0) && (mask2 == 0)) break;
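// _mm_movemask_epi8 yields four mask bits per 32-bit lane, so bit 3 of each nibble
// carries that lane's compare result. factor2's lane 0 (bit 0x0008) duplicates
// factor1's lane 3 and is skipped to avoid eliminating the same factor twice.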
if (mask1 & 0x0008) _addToSieveCache(factorsTable, sieveCache, sieveCachePos, factor1.v[0]);
if (mask1 & 0x0080) _addToSieveCache(factorsTable, sieveCache, sieveCachePos, factor1.v[1]);
if (mask1 & 0x0800) _addToSieveCache(factorsTable, sieveCache, sieveCachePos, factor1.v[2]);
if (mask1 & 0x8000) _addToSieveCache(factorsTable, sieveCache, sieveCachePos, factor1.v[3]);
if (mask2 & 0x0080) _addToSieveCache(factorsTable, sieveCache, sieveCachePos, factor2.v[1]);
if (mask2 & 0x0800) _addToSieveCache(factorsTable, sieveCache, sieveCachePos, factor2.v[2]);
if (mask2 & 0x8000) _addToSieveCache(factorsTable, sieveCache, sieveCachePos, factor2.v[3]);
nextIncr1.m128 = _mm_and_si128(cmpres1.m128, p1.m128);
nextIncr2.m128 = _mm_and_si128(cmpres2.m128, p1.m128);
factor1.m128 = _mm_add_epi32(factor1.m128, nextIncr1.m128);
factor2.m128 = _mm_add_epi32(factor2.m128, nextIncr2.m128);
}
factor1.m128 = _mm_sub_epi32(factor1.m128, offsetmax.m128);
factor2.m128 = _mm_sub_epi32(factor2.m128, offsetmax.m128);
_mm_storeu_si128(reinterpret_cast<__m128i*>(&factorsToEliminate[i*7 + 0]), factor1.m128);
_mm_storeu_si128(reinterpret_cast<__m128i*>(&factorsToEliminate[i*7 + 3]), factor2.m128);
}
_endSieveCache(factorsTable, sieveCache);
}

void Miner::_processSieve7_avx2(uint64_t *factorsTable, uint32_t* factorsToEliminate, uint64_t firstPrimeIndex, const uint64_t lastPrimeIndex) { // Assembly optimized sieving for 7-tuples by Michael Bell
assert(_parameters.pattern.size() == 7);

#ifdef __AVX2__
std::array<uint32_t, sieveCacheSize> sieveCache{0};
uint64_t sieveCachePos(0);

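// Primes are consumed two per iteration below, so the range must end on an even
// index; init() forces _primesIndexThreshold even for this reason.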
assert((lastPrimeIndex & 1) == 0);
// If the first prime to sieve has an odd index, eliminate its factors with scalar code first so the vectorized loop below starts on an even index
if ((firstPrimeIndex & 1) != 0) {
for (uint64_t f(0) ; f < 7 ; f++) {
while (factorsToEliminate[firstPrimeIndex*7 + f] < _parameters.sieveSize) {
_addToSieveCache(factorsTable, sieveCache, sieveCachePos, factorsToEliminate[firstPrimeIndex*7 + f]);
factorsToEliminate[firstPrimeIndex*7 + f] += _primes32[firstPrimeIndex];
}
factorsToEliminate[firstPrimeIndex*7 + f] -= _parameters.sieveSize;
}
firstPrimeIndex++;
}

ymmreg_t offsetmax;
offsetmax.m256 = _mm256_set1_epi32(_parameters.sieveSize);
ymmreg_t storemask;
storemask.m256 = _mm256_set1_epi32(0xffffffff);
storemask.v[0] = 0;
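// Lane 0 of the masked store at the bottom of the loop is suppressed: that slot
// (offset i*7 + 6) is already written correctly by factor1's full-width store.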
for (uint64_t i(firstPrimeIndex) ; i < lastPrimeIndex ; i += 2) {
ymmreg_t p1, p2;
ymmreg_t factor1, factor2, nextIncr1, nextIncr2;
ymmreg_t cmpres1, cmpres2;
p1.m256 = _mm256_set1_epi32(_primes32[i]);
p2.m256 = _mm256_set1_epi32(_primes32[i + 1]);
factor1.m256 = _mm256_loadu_si256(reinterpret_cast<__m256i const*>(&factorsToEliminate[i*7 + 0]));
factor2.m256 = _mm256_loadu_si256(reinterpret_cast<__m256i const*>(&factorsToEliminate[i*7 + 6]));
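// Two primes per iteration: factor1 holds the seven offsets of prime i plus, in
// lane 7, the first offset of prime i+1 (its mask bit is never tested and its stale
// store is overwritten by factor2's masked store); factor2 overlaps by one lane,
// holding the shared slot in lane 0 plus the seven offsets of prime i+1.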
while (true) {
cmpres1.m256 = _mm256_cmpgt_epi32(offsetmax.m256, factor1.m256);
cmpres2.m256 = _mm256_cmpgt_epi32(offsetmax.m256, factor2.m256);
const int mask1(_mm256_movemask_epi8(cmpres1.m256));
const int mask2(_mm256_movemask_epi8(cmpres2.m256));
if ((mask1 == 0) && (mask2 == 0)) break;
if (mask1 & 0x00000008) _addToSieveCache(factorsTable, sieveCache, sieveCachePos, factor1.v[0]);
if (mask1 & 0x00000080) _addToSieveCache(factorsTable, sieveCache, sieveCachePos, factor1.v[1]);
if (mask1 & 0x00000800) _addToSieveCache(factorsTable, sieveCache, sieveCachePos, factor1.v[2]);
if (mask1 & 0x00008000) _addToSieveCache(factorsTable, sieveCache, sieveCachePos, factor1.v[3]);
if (mask1 & 0x00080000) _addToSieveCache(factorsTable, sieveCache, sieveCachePos, factor1.v[4]);
if (mask1 & 0x00800000) _addToSieveCache(factorsTable, sieveCache, sieveCachePos, factor1.v[5]);
if (mask1 & 0x08000000) _addToSieveCache(factorsTable, sieveCache, sieveCachePos, factor1.v[6]);
if (mask2 & 0x00000080) _addToSieveCache(factorsTable, sieveCache, sieveCachePos, factor2.v[1]);
if (mask2 & 0x00000800) _addToSieveCache(factorsTable, sieveCache, sieveCachePos, factor2.v[2]);
if (mask2 & 0x00008000) _addToSieveCache(factorsTable, sieveCache, sieveCachePos, factor2.v[3]);
if (mask2 & 0x00080000) _addToSieveCache(factorsTable, sieveCache, sieveCachePos, factor2.v[4]);
if (mask2 & 0x00800000) _addToSieveCache(factorsTable, sieveCache, sieveCachePos, factor2.v[5]);
if (mask2 & 0x08000000) _addToSieveCache(factorsTable, sieveCache, sieveCachePos, factor2.v[6]);
if (mask2 & 0x80000000) _addToSieveCache(factorsTable, sieveCache, sieveCachePos, factor2.v[7]);
nextIncr1.m256 = _mm256_and_si256(cmpres1.m256, p1.m256);
nextIncr2.m256 = _mm256_and_si256(cmpres2.m256, p2.m256);
factor1.m256 = _mm256_add_epi32(factor1.m256, nextIncr1.m256);
factor2.m256 = _mm256_add_epi32(factor2.m256, nextIncr2.m256);
}
factor1.m256 = _mm256_sub_epi32(factor1.m256, offsetmax.m256);
factor2.m256 = _mm256_sub_epi32(factor2.m256, offsetmax.m256);
_mm256_storeu_si256(reinterpret_cast<__m256i*>(&factorsToEliminate[i*7 + 0]), factor1.m256);
_mm256_maskstore_epi32(reinterpret_cast<int*>(&factorsToEliminate[i*7 + 6]), storemask.m256, factor2.m256);
}
_endSieveCache(factorsTable, sieveCache);
#else
printf("Not compiled with AVX2. Exit\n");
exit(3);
#endif
}

void Miner::_processSieve8(uint64_t *factorsTable, uint32_t* factorsToEliminate, uint64_t firstPrimeIndex, const uint64_t lastPrimeIndex) { // Assembly optimized sieving for 8-tuples by Michael Bell
assert(_parameters.pattern.size() == 8);
std::array<uint32_t, sieveCacheSize> sieveCache{0};
uint64_t sieveCachePos(0);
xmmreg_t offsetmax;
offsetmax.m128 = _mm_set1_epi32(_parameters.sieveSize);
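// Eight offsets per prime fill exactly two 4-lane registers, so nothing overlaps
// and the 16-byte-aligned load/store variants can be used; alignment comes from
// the __m256i-backed allocation in init().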
for (uint64_t i(firstPrimeIndex) ; i < lastPrimeIndex ; i += 1) {
xmmreg_t p1;
xmmreg_t factor1, factor2, nextIncr1, nextIncr2;
xmmreg_t cmpres1, cmpres2;
p1.m128 = _mm_set1_epi32(_primes32[i]);
factor1.m128 = _mm_load_si128(reinterpret_cast<__m128i const*>(&factorsToEliminate[i*8 + 0]));
factor2.m128 = _mm_load_si128(reinterpret_cast<__m128i const*>(&factorsToEliminate[i*8 + 4]));
while (true) {
cmpres1.m128 = _mm_cmpgt_epi32(offsetmax.m128, factor1.m128);
cmpres2.m128 = _mm_cmpgt_epi32(offsetmax.m128, factor2.m128);
const int mask1(_mm_movemask_epi8(cmpres1.m128));
const int mask2(_mm_movemask_epi8(cmpres2.m128));
if ((mask1 == 0) && (mask2 == 0)) break;
if (mask1 & 0x0008) _addToSieveCache(factorsTable, sieveCache, sieveCachePos, factor1.v[0]);
if (mask1 & 0x0080) _addToSieveCache(factorsTable, sieveCache, sieveCachePos, factor1.v[1]);
if (mask1 & 0x0800) _addToSieveCache(factorsTable, sieveCache, sieveCachePos, factor1.v[2]);
if (mask1 & 0x8000) _addToSieveCache(factorsTable, sieveCache, sieveCachePos, factor1.v[3]);
if (mask2 & 0x0008) _addToSieveCache(factorsTable, sieveCache, sieveCachePos, factor2.v[0]);
if (mask2 & 0x0080) _addToSieveCache(factorsTable, sieveCache, sieveCachePos, factor2.v[1]);
if (mask2 & 0x0800) _addToSieveCache(factorsTable, sieveCache, sieveCachePos, factor2.v[2]);
if (mask2 & 0x8000) _addToSieveCache(factorsTable, sieveCache, sieveCachePos, factor2.v[3]);
nextIncr1.m128 = _mm_and_si128(cmpres1.m128, p1.m128);
nextIncr2.m128 = _mm_and_si128(cmpres2.m128, p1.m128);
factor1.m128 = _mm_add_epi32(factor1.m128, nextIncr1.m128);
factor2.m128 = _mm_add_epi32(factor2.m128, nextIncr2.m128);
}
factor1.m128 = _mm_sub_epi32(factor1.m128, offsetmax.m128);
factor2.m128 = _mm_sub_epi32(factor2.m128, offsetmax.m128);
_mm_store_si128(reinterpret_cast<__m128i*>(&factorsToEliminate[i*8 + 0]), factor1.m128);
_mm_store_si128(reinterpret_cast<__m128i*>(&factorsToEliminate[i*8 + 4]), factor2.m128);
}
_endSieveCache(factorsTable, sieveCache);
}

void Miner::_processSieve8_avx2(uint64_t *factorsTable, uint32_t* factorsToEliminate, uint64_t firstPrimeIndex, const uint64_t lastPrimeIndex) { // Assembly optimized sieving for 8-tuples by Michael Bell
assert(_parameters.pattern.size() == 8);

#ifdef __AVX2__
std::array<uint32_t, sieveCacheSize> sieveCache{0};
uint64_t sieveCachePos(0);

assert((lastPrimeIndex & 1) == 0);
// If the first prime to sieve has an odd index, eliminate its factors with scalar code first so the vectorized loop below starts on an even index
if ((firstPrimeIndex & 1) != 0) {
for (uint64_t f(0) ; f < 8 ; f++) {
while (factorsToEliminate[firstPrimeIndex*8 + f] < _parameters.sieveSize) {
_addToSieveCache(factorsTable, sieveCache, sieveCachePos, factorsToEliminate[firstPrimeIndex*8 + f]);
factorsToEliminate[firstPrimeIndex*8 + f] += _primes32[firstPrimeIndex];
}
factorsToEliminate[firstPrimeIndex*8 + f] -= _parameters.sieveSize;
}
firstPrimeIndex++;
}

ymmreg_t offsetmax;
offsetmax.m256 = _mm256_set1_epi32(_parameters.sieveSize);
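// Two primes per iteration with eight lanes each: prime i's offsets fill factor1
// exactly and prime i+1's fill factor2, so unlike the 7-tuple path no overlap
// handling or masked store is needed, and full 32-byte aligned accesses are possible.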
for (uint64_t i(firstPrimeIndex) ; i < lastPrimeIndex ; i += 2) {
ymmreg_t p1, p2;
ymmreg_t factor1, factor2, nextIncr1, nextIncr2;
ymmreg_t cmpres1, cmpres2;
p1.m256 = _mm256_set1_epi32(_primes32[i]);
p2.m256 = _mm256_set1_epi32(_primes32[i + 1]);
factor1.m256 = _mm256_load_si256(reinterpret_cast<__m256i const*>(&factorsToEliminate[i*8 + 0]));
factor2.m256 = _mm256_load_si256(reinterpret_cast<__m256i const*>(&factorsToEliminate[i*8 + 8]));
while (true) {
cmpres1.m256 = _mm256_cmpgt_epi32(offsetmax.m256, factor1.m256);
cmpres2.m256 = _mm256_cmpgt_epi32(offsetmax.m256, factor2.m256);
const int mask1(_mm256_movemask_epi8(cmpres1.m256));
const int mask2(_mm256_movemask_epi8(cmpres2.m256));
if ((mask1 == 0) && (mask2 == 0)) break;
if (mask1 & 0x00000008) _addToSieveCache(factorsTable, sieveCache, sieveCachePos, factor1.v[0]);
if (mask1 & 0x00000080) _addToSieveCache(factorsTable, sieveCache, sieveCachePos, factor1.v[1]);
if (mask1 & 0x00000800) _addToSieveCache(factorsTable, sieveCache, sieveCachePos, factor1.v[2]);
if (mask1 & 0x00008000) _addToSieveCache(factorsTable, sieveCache, sieveCachePos, factor1.v[3]);
if (mask1 & 0x00080000) _addToSieveCache(factorsTable, sieveCache, sieveCachePos, factor1.v[4]);
if (mask1 & 0x00800000) _addToSieveCache(factorsTable, sieveCache, sieveCachePos, factor1.v[5]);
if (mask1 & 0x08000000) _addToSieveCache(factorsTable, sieveCache, sieveCachePos, factor1.v[6]);
if (mask1 & 0x80000000) _addToSieveCache(factorsTable, sieveCache, sieveCachePos, factor1.v[7]);
if (mask2 & 0x00000008) _addToSieveCache(factorsTable, sieveCache, sieveCachePos, factor2.v[0]);
if (mask2 & 0x00000080) _addToSieveCache(factorsTable, sieveCache, sieveCachePos, factor2.v[1]);
if (mask2 & 0x00000800) _addToSieveCache(factorsTable, sieveCache, sieveCachePos, factor2.v[2]);
if (mask2 & 0x00008000) _addToSieveCache(factorsTable, sieveCache, sieveCachePos, factor2.v[3]);
if (mask2 & 0x00080000) _addToSieveCache(factorsTable, sieveCache, sieveCachePos, factor2.v[4]);
if (mask2 & 0x00800000) _addToSieveCache(factorsTable, sieveCache, sieveCachePos, factor2.v[5]);
if (mask2 & 0x08000000) _addToSieveCache(factorsTable, sieveCache, sieveCachePos, factor2.v[6]);
if (mask2 & 0x80000000) _addToSieveCache(factorsTable, sieveCache, sieveCachePos, factor2.v[7]);
nextIncr1.m256 = _mm256_and_si256(cmpres1.m256, p1.m256);
nextIncr2.m256 = _mm256_and_si256(cmpres2.m256, p2.m256);
factor1.m256 = _mm256_add_epi32(factor1.m256, nextIncr1.m256);
factor2.m256 = _mm256_add_epi32(factor2.m256, nextIncr2.m256);
}
factor1.m256 = _mm256_sub_epi32(factor1.m256, offsetmax.m256);
factor2.m256 = _mm256_sub_epi32(factor2.m256, offsetmax.m256);
_mm256_store_si256(reinterpret_cast<__m256i*>(&factorsToEliminate[i*8 + 0]), factor1.m256);
_mm256_store_si256(reinterpret_cast<__m256i*>(&factorsToEliminate[i*8 + 8]), factor2.m256);
}
_endSieveCache(factorsTable, sieveCache);
#else
printf("Not compiled with AVX2. Exit\n");
exit(3);
#endif
}

void Miner::_doSieveTask(Task task) {
Sieve& sieve(_sieves[task.sieve.id]);
std::unique_lock<std::mutex> presieveLock(sieve.presieveLock, std::defer_lock);
@@ -677,6 +897,16 @@ void Miner::_doSieveTask(Task task) {
// Eliminate the p*i + fp factors (p < factorMax).
if (_parameters.pattern.size() == 6)
_processSieve6(sieve.factorsTable, sieve.factorsToEliminate, firstPrimeIndex, _primesIndexThreshold);
else if (_parameters.pattern.size() == 7) {
if (_parameters.useAvx2)
_processSieve7_avx2(sieve.factorsTable, sieve.factorsToEliminate, firstPrimeIndex, _primesIndexThreshold);
else
_processSieve7(sieve.factorsTable, sieve.factorsToEliminate, firstPrimeIndex, _primesIndexThreshold);
}
else if (_parameters.pattern.size() == 8) {
if (_parameters.useAvx2)
_processSieve8_avx2(sieve.factorsTable, sieve.factorsToEliminate, firstPrimeIndex, _primesIndexThreshold);
else
_processSieve8(sieve.factorsTable, sieve.factorsToEliminate, firstPrimeIndex, _primesIndexThreshold);
}
else
_processSieve(sieve.factorsTable, sieve.factorsToEliminate, firstPrimeIndex, _primesIndexThreshold);

13 changes: 12 additions & 1 deletion Miner.hpp
@@ -5,6 +5,7 @@

#include <atomic>
#include <cassert>
#include <immintrin.h>
#include "Stats.hpp"
#include "Client.hpp"
#include "StratumClient.hpp"
@@ -17,7 +18,13 @@ union xmmreg_t {
__m128i m128;
};

- constexpr uint32_t sieveCacheSize(16);
union ymmreg_t {
uint32_t v[8];
uint64_t v64[4];
__m256i m256;
};

+ constexpr uint32_t sieveCacheSize(32);
constexpr uint32_t nWorks(2);

inline mpz_class u64ToMpz(const uint64_t u64) {
@@ -147,6 +154,10 @@ class Miner {
void _doPresieveTask(const Task&);
void _processSieve(uint64_t*, uint32_t*, const uint64_t, const uint64_t);
void _processSieve6(uint64_t*, uint32_t*, uint64_t, const uint64_t);
void _processSieve7(uint64_t*, uint32_t*, uint64_t, const uint64_t);
void _processSieve7_avx2(uint64_t*, uint32_t*, uint64_t, const uint64_t);
void _processSieve8(uint64_t*, uint32_t*, uint64_t, const uint64_t);
void _processSieve8_avx2(uint64_t*, uint32_t*, uint64_t, const uint64_t);
void _doSieveTask(Task);
bool _testPrimesIspc(const std::array<uint32_t, maxCandidatesPerCheckTask>&, uint32_t[maxCandidatesPerCheckTask], const mpz_class&, mpz_class&);
void _doCheckTask(Task);
