Permalink
Switch branches/tags
Nothing to show
Find file Copy path
Fetching contributors…
Cannot retrieve contributors at this time
131 lines (130 sloc) 11.4 KB
input size: 67108864
improved scalar ... 0.22892
scalar ... 0.27601 (speed up: 0.83)
scalar & BMI2 ... 0.27560 (speed up: 0.83)
SSE (lookup: base, pack: naive) ... 0.12418 (speed up: 1.84)
SSE (lookup: byte blend, pack: naive) ... 0.23887 (speed up: 0.96)
SSE (lookup: incremental, pack: naive) ... 0.16513 (speed up: 1.39)
SSE (lookup: pshufb, pack: naive) ... 0.19014 (speed up: 1.20)
SSE (lookup: base, pack: multiply-add) ... 0.12883 (speed up: 1.78)
SSE (lookup: byte blend, pack: multiply-add) ... 0.23542 (speed up: 0.97)
SSE (lookup: incremental, pack: multiply-add) ... 0.16103 (speed up: 1.42)
SSE (lookup: pshufb, pack: multiply-add) ... 0.18666 (speed up: 1.23)
SSE & BMI2 (lookup: base, pack: N/A) ... 0.18426 (speed up: 1.24)
SSE & BMI2 (lookup: byte blend, pack: N/A) ... 0.30707 (speed up: 0.75)
SSE & BMI2 (lookup: incremental, pack: N/A) ... 0.22651 (speed up: 1.01)
AVX2 (lookup: base, pack: naive) ... 0.08318 (speed up: 2.75)
AVX2 (lookup: byte blend, pack: naive) ... 0.13912 (speed up: 1.65)
AVX2 (lookup: pshufb, pack: naive) ... 0.12525 (speed up: 1.83)
AVX2 (lookup: base, pack: multiply-add) ... 0.10915 (speed up: 2.10)
AVX2 (lookup: byte blend, pack: multiply-add) ... 0.16510 (speed up: 1.39)
AVX2 (lookup: pshufb, pack: multiply-add) ... 0.15071 (speed up: 1.52)
AVX2 & BMI2 (lookup: base, pack: N/A) ... 0.15987 (speed up: 1.43)
AVX2 & BMI2 (lookup: byte blend, pack: N/A) ... 0.20551 (speed up: 1.11)
AVX512 (gather) ... 0.05150 (speed up: 4.45)
AVX512 (store: vectorized) (lookup: vectorized, pack: improved) ... 0.04385 (speed up: 5.22)
AVX512 (store: scatter) (lookup: vectorized, pack: improved) ... 0.04820 (speed up: 4.75)
input size: 67108864
improved scalar ... 0.22890
scalar ... 0.27606 (speed up: 0.83)
scalar & BMI2 ... 0.27553 (speed up: 0.83)
SSE (lookup: base, pack: naive) ... 0.12420 (speed up: 1.84)
SSE (lookup: byte blend, pack: naive) ... 0.23880 (speed up: 0.96)
SSE (lookup: incremental, pack: naive) ... 0.16509 (speed up: 1.39)
SSE (lookup: pshufb, pack: naive) ... 0.19003 (speed up: 1.20)
SSE (lookup: base, pack: multiply-add) ... 0.12922 (speed up: 1.77)
SSE (lookup: byte blend, pack: multiply-add) ... 0.23541 (speed up: 0.97)
SSE (lookup: incremental, pack: multiply-add) ... 0.16114 (speed up: 1.42)
SSE (lookup: pshufb, pack: multiply-add) ... 0.18676 (speed up: 1.23)
SSE & BMI2 (lookup: base, pack: N/A) ... 0.18443 (speed up: 1.24)
SSE & BMI2 (lookup: byte blend, pack: N/A) ... 0.30721 (speed up: 0.75)
SSE & BMI2 (lookup: incremental, pack: N/A) ... 0.22664 (speed up: 1.01)
AVX2 (lookup: base, pack: naive) ... 0.08320 (speed up: 2.75)
AVX2 (lookup: byte blend, pack: naive) ... 0.13898 (speed up: 1.65)
AVX2 (lookup: pshufb, pack: naive) ... 0.12543 (speed up: 1.82)
AVX2 (lookup: base, pack: multiply-add) ... 0.10937 (speed up: 2.09)
AVX2 (lookup: byte blend, pack: multiply-add) ... 0.16506 (speed up: 1.39)
AVX2 (lookup: pshufb, pack: multiply-add) ... 0.15092 (speed up: 1.52)
AVX2 & BMI2 (lookup: base, pack: N/A) ... 0.15992 (speed up: 1.43)
AVX2 & BMI2 (lookup: byte blend, pack: N/A) ... 0.20581 (speed up: 1.11)
AVX512 (gather) ... 0.05202 (speed up: 4.40)
AVX512 (store: vectorized) (lookup: vectorized, pack: improved) ... 0.04453 (speed up: 5.14)
AVX512 (store: scatter) (lookup: vectorized, pack: improved) ... 0.04888 (speed up: 4.68)
input size: 67108864
improved scalar ... 0.22910
scalar ... 0.27604 (speed up: 0.83)
scalar & BMI2 ... 0.27541 (speed up: 0.83)
SSE (lookup: base, pack: naive) ... 0.12366 (speed up: 1.85)
SSE (lookup: byte blend, pack: naive) ... 0.23879 (speed up: 0.96)
SSE (lookup: incremental, pack: naive) ... 0.16513 (speed up: 1.39)
SSE (lookup: pshufb, pack: naive) ... 0.19002 (speed up: 1.21)
SSE (lookup: base, pack: multiply-add) ... 0.12887 (speed up: 1.78)
SSE (lookup: byte blend, pack: multiply-add) ... 0.23544 (speed up: 0.97)
SSE (lookup: incremental, pack: multiply-add) ... 0.16100 (speed up: 1.42)
SSE (lookup: pshufb, pack: multiply-add) ... 0.18676 (speed up: 1.23)
SSE & BMI2 (lookup: base, pack: N/A) ... 0.18427 (speed up: 1.24)
SSE & BMI2 (lookup: byte blend, pack: N/A) ... 0.30725 (speed up: 0.75)
SSE & BMI2 (lookup: incremental, pack: N/A) ... 0.22665 (speed up: 1.01)
AVX2 (lookup: base, pack: naive) ... 0.08321 (speed up: 2.75)
AVX2 (lookup: byte blend, pack: naive) ... 0.13910 (speed up: 1.65)
AVX2 (lookup: pshufb, pack: naive) ... 0.12537 (speed up: 1.83)
AVX2 (lookup: base, pack: multiply-add) ... 0.10932 (speed up: 2.10)
AVX2 (lookup: byte blend, pack: multiply-add) ... 0.16504 (speed up: 1.39)
AVX2 (lookup: pshufb, pack: multiply-add) ... 0.15094 (speed up: 1.52)
AVX2 & BMI2 (lookup: base, pack: N/A) ... 0.16001 (speed up: 1.43)
AVX2 & BMI2 (lookup: byte blend, pack: N/A) ... 0.20553 (speed up: 1.11)
AVX512 (gather) ... 0.05198 (speed up: 4.41)
AVX512 (store: vectorized) (lookup: vectorized, pack: improved) ... 0.04432 (speed up: 5.17)
AVX512 (store: scatter) (lookup: vectorized, pack: improved) ... 0.04854 (speed up: 4.72)
input size: 67108864
improved scalar ... 0.22901
scalar ... 0.27600 (speed up: 0.83)
scalar & BMI2 ... 0.27556 (speed up: 0.83)
SSE (lookup: base, pack: naive) ... 0.12384 (speed up: 1.85)
SSE (lookup: byte blend, pack: naive) ... 0.23888 (speed up: 0.96)
SSE (lookup: incremental, pack: naive) ... 0.16494 (speed up: 1.39)
SSE (lookup: pshufb, pack: naive) ... 0.19004 (speed up: 1.21)
SSE (lookup: base, pack: multiply-add) ... 0.12893 (speed up: 1.78)
SSE (lookup: byte blend, pack: multiply-add) ... 0.23528 (speed up: 0.97)
SSE (lookup: incremental, pack: multiply-add) ... 0.16110 (speed up: 1.42)
SSE (lookup: pshufb, pack: multiply-add) ... 0.18659 (speed up: 1.23)
SSE & BMI2 (lookup: base, pack: N/A) ... 0.18425 (speed up: 1.24)
SSE & BMI2 (lookup: byte blend, pack: N/A) ... 0.30713 (speed up: 0.75)
SSE & BMI2 (lookup: incremental, pack: N/A) ... 0.22661 (speed up: 1.01)
AVX2 (lookup: base, pack: naive) ... 0.08319 (speed up: 2.75)
AVX2 (lookup: byte blend, pack: naive) ... 0.13904 (speed up: 1.65)
AVX2 (lookup: pshufb, pack: naive) ... 0.12526 (speed up: 1.83)
AVX2 (lookup: base, pack: multiply-add) ... 0.10924 (speed up: 2.10)
AVX2 (lookup: byte blend, pack: multiply-add) ... 0.16506 (speed up: 1.39)
AVX2 (lookup: pshufb, pack: multiply-add) ... 0.15076 (speed up: 1.52)
AVX2 & BMI2 (lookup: base, pack: N/A) ... 0.15993 (speed up: 1.43)
AVX2 & BMI2 (lookup: byte blend, pack: N/A) ... 0.20582 (speed up: 1.11)
AVX512 (gather) ... 0.05194 (speed up: 4.41)
AVX512 (store: vectorized) (lookup: vectorized, pack: improved) ... 0.04414 (speed up: 5.19)
AVX512 (store: scatter) (lookup: vectorized, pack: improved) ... 0.04831 (speed up: 4.74)
input size: 67108864
improved scalar ... 0.22893
scalar ... 0.27610 (speed up: 0.83)
scalar & BMI2 ... 0.27561 (speed up: 0.83)
SSE (lookup: base, pack: naive) ... 0.12339 (speed up: 1.86)
SSE (lookup: byte blend, pack: naive) ... 0.23890 (speed up: 0.96)
SSE (lookup: incremental, pack: naive) ... 0.16515 (speed up: 1.39)
SSE (lookup: pshufb, pack: naive) ... 0.18996 (speed up: 1.21)
SSE (lookup: base, pack: multiply-add) ... 0.12894 (speed up: 1.78)
SSE (lookup: byte blend, pack: multiply-add) ... 0.23543 (speed up: 0.97)
SSE (lookup: incremental, pack: multiply-add) ... 0.16112 (speed up: 1.42)
SSE (lookup: pshufb, pack: multiply-add) ... 0.18685 (speed up: 1.23)
SSE & BMI2 (lookup: base, pack: N/A) ... 0.18427 (speed up: 1.24)
SSE & BMI2 (lookup: byte blend, pack: N/A) ... 0.30710 (speed up: 0.75)
SSE & BMI2 (lookup: incremental, pack: N/A) ... 0.22669 (speed up: 1.01)
AVX2 (lookup: base, pack: naive) ... 0.08324 (speed up: 2.75)
AVX2 (lookup: byte blend, pack: naive) ... 0.13914 (speed up: 1.65)
AVX2 (lookup: pshufb, pack: naive) ... 0.12542 (speed up: 1.83)
AVX2 (lookup: base, pack: multiply-add) ... 0.10928 (speed up: 2.09)
AVX2 (lookup: byte blend, pack: multiply-add) ... 0.16505 (speed up: 1.39)
AVX2 (lookup: pshufb, pack: multiply-add) ... 0.15093 (speed up: 1.52)
AVX2 & BMI2 (lookup: base, pack: N/A) ... 0.15995 (speed up: 1.43)
AVX2 & BMI2 (lookup: byte blend, pack: N/A) ... 0.20591 (speed up: 1.11)
AVX512 (gather) ... 0.05200 (speed up: 4.40)
AVX512 (store: vectorized) (lookup: vectorized, pack: improved) ... 0.04478 (speed up: 5.11)
AVX512 (store: scatter) (lookup: vectorized, pack: improved) ... 0.04884 (speed up: 4.69)