After upgrading to AlmaLinux 10, we noticed that a simple sanity check of the dot product gave a wrong result. The following small program demonstrates the problem:
This happens because the vectorized part of dot_kernel_asimd accumulates its result into the v0.4s (aka s0) register, while the surrounding code refers to the result through the symbolic operand %s[DOT_] via the OUT macro. This only works when the compiler happens to pick s0 as the register for the "dot" variable.
0000000000000000 <sdot_k_NEOVERSEN1>:
0: 0f000400 movi v0.2s, #0x0
4: f100001f cmp x0, #0x0
8: 54000f8d b.le 1f8 <sdot_k_NEOVERSEN1+0x1f8>
c: d2800005 mov x5, #0x0 // #0
10: 1e2703ff fmov s31, wzr
14: 9e6703e1 fmov d1, xzr
18: 9e6703e2 fmov d2, xzr
1c: 9e6703e3 fmov d3, xzr
20: 9e6703e4 fmov d4, xzr
24: 9e6703e5 fmov d5, xzr
28: 9e6703e6 fmov d6, xzr
2c: 9e6703e7 fmov d7, xzr
30: f100045f cmp x2, #0x1
34: 54000981 b.ne 164 <sdot_k_NEOVERSEN1+0x164> // b.any
38: f100049f cmp x4, #0x1
3c: 54000941 b.ne 164 <sdot_k_NEOVERSEN1+0x164> // b.any
40: d37ef442 lsl x2, x2, #2
44: d37ef484 lsl x4, x4, #2
48: 9346fc05 asr x5, x0, #6
4c: eb1f00bf cmp x5, xzr
50: 54000760 b.eq 13c <sdot_k_NEOVERSEN1+0x13c> // b.none
54: d503201f nop
58: d503201f nop
5c: d503201f nop
60: ad404430 ldp q16, q17, [x1]
64: ad406478 ldp q24, q25, [x3]
68: ad414c32 ldp q18, q19, [x1, #32]
6c: ad416c7a ldp q26, q27, [x3, #32]
70: 4e38ce00 fmla v0.4s, v16.4s, v24.4s
74: 4e39ce21 fmla v1.4s, v17.4s, v25.4s
78: ad425434 ldp q20, q21, [x1, #64]
7c: ad42747c ldp q28, q29, [x3, #64]
80: 4e3ace42 fmla v2.4s, v18.4s, v26.4s
84: 4e3bce63 fmla v3.4s, v19.4s, v27.4s
88: ad435c36 ldp q22, q23, [x1, #96]
8c: ad437c7e ldp q30, q31, [x3, #96]
90: 91020063 add x3, x3, #0x80
94: 91020021 add x1, x1, #0x80
98: 4e3cce84 fmla v4.4s, v20.4s, v28.4s
9c: 4e3dcea5 fmla v5.4s, v21.4s, v29.4s
a0: f981c020 prfm pldl1keep, [x1, #896]
a4: f981c060 prfm pldl1keep, [x3, #896]
a8: f981e020 prfm pldl1keep, [x1, #960]
ac: f981e060 prfm pldl1keep, [x3, #960]
b0: 4e3ecec6 fmla v6.4s, v22.4s, v30.4s
b4: 4e3fcee7 fmla v7.4s, v23.4s, v31.4s
b8: ad404430 ldp q16, q17, [x1]
bc: ad406478 ldp q24, q25, [x3]
c0: ad414c32 ldp q18, q19, [x1, #32]
c4: ad416c7a ldp q26, q27, [x3, #32]
c8: 4e38ce00 fmla v0.4s, v16.4s, v24.4s
cc: 4e39ce21 fmla v1.4s, v17.4s, v25.4s
d0: ad425434 ldp q20, q21, [x1, #64]
d4: ad42747c ldp q28, q29, [x3, #64]
d8: 4e3ace42 fmla v2.4s, v18.4s, v26.4s
dc: 4e3bce63 fmla v3.4s, v19.4s, v27.4s
e0: ad435c36 ldp q22, q23, [x1, #96]
e4: ad437c7e ldp q30, q31, [x3, #96]
e8: 91020063 add x3, x3, #0x80
ec: 91020021 add x1, x1, #0x80
f0: 4e3cce84 fmla v4.4s, v20.4s, v28.4s
f4: 4e3dcea5 fmla v5.4s, v21.4s, v29.4s
f8: f981c020 prfm pldl1keep, [x1, #896]
fc: f981c060 prfm pldl1keep, [x3, #896]
100: f981e020 prfm pldl1keep, [x1, #960]
104: f981e060 prfm pldl1keep, [x3, #960]
108: 4e3ecec6 fmla v6.4s, v22.4s, v30.4s
10c: 4e3fcee7 fmla v7.4s, v23.4s, v31.4s
110: f10004a5 subs x5, x5, #0x1
114: 54fffa61 b.ne 60 <sdot_k_NEOVERSEN1+0x60> // b.any
118: 4e21d400 fadd v0.4s, v0.4s, v1.4s
11c: 4e23d442 fadd v2.4s, v2.4s, v3.4s
120: 4e25d484 fadd v4.4s, v4.4s, v5.4s
124: 4e27d4c6 fadd v6.4s, v6.4s, v7.4s
128: 4e22d400 fadd v0.4s, v0.4s, v2.4s
12c: 4e26d484 fadd v4.4s, v4.4s, v6.4s
130: 4e24d400 fadd v0.4s, v0.4s, v4.4s
134: 6e20d400 faddp v0.4s, v0.4s, v0.4s
138: 6e20d400 faddp v0.4s, v0.4s, v0.4s
13c: f2401405 ands x5, x0, #0x3f
140: 540005ad b.le 1f4 <sdot_k_NEOVERSEN1+0x1f4>
144: bd400030 ldr s16, [x1]
148: bd400078 ldr s24, [x3]
14c: 8b020021 add x1, x1, x2
150: 8b040063 add x3, x3, x4
154: 1f187e1f fmadd s31, s16, s24, s31
158: f10004a5 subs x5, x5, #0x1
15c: 54ffff41 b.ne 144 <sdot_k_NEOVERSEN1+0x144> // b.any
160: 14000025 b 1f4 <sdot_k_NEOVERSEN1+0x1f4>
164: d37ef442 lsl x2, x2, #2
168: d37ef484 lsl x4, x4, #2
16c: 9342fc05 asr x5, x0, #2
170: eb1f00bf cmp x5, xzr
174: 540002ed b.le 1d0 <sdot_k_NEOVERSEN1+0x1d0>
178: bd400030 ldr s16, [x1]
17c: bd400078 ldr s24, [x3]
180: 8b020021 add x1, x1, x2
184: 8b040063 add x3, x3, x4
188: 1f187e1f fmadd s31, s16, s24, s31
18c: bd400030 ldr s16, [x1]
190: bd400078 ldr s24, [x3]
194: 8b020021 add x1, x1, x2
198: 8b040063 add x3, x3, x4
19c: 1f187e1f fmadd s31, s16, s24, s31
1a0: bd400030 ldr s16, [x1]
1a4: bd400078 ldr s24, [x3]
1a8: 8b020021 add x1, x1, x2
1ac: 8b040063 add x3, x3, x4
1b0: 1f187e1f fmadd s31, s16, s24, s31
1b4: bd400030 ldr s16, [x1]
1b8: bd400078 ldr s24, [x3]
1bc: 8b020021 add x1, x1, x2
1c0: 8b040063 add x3, x3, x4
1c4: 1f187e1f fmadd s31, s16, s24, s31
1c8: f10004a5 subs x5, x5, #0x1
1cc: 54fffd61 b.ne 178 <sdot_k_NEOVERSEN1+0x178> // b.any
1d0: f2400405 ands x5, x0, #0x3
1d4: 5400010d b.le 1f4 <sdot_k_NEOVERSEN1+0x1f4>
1d8: bd400030 ldr s16, [x1]
1dc: bd400078 ldr s24, [x3]
1e0: 8b020021 add x1, x1, x2
1e4: 8b040063 add x3, x3, x4
1e8: 1f187e1f fmadd s31, s16, s24, s31
1ec: f10004a5 subs x5, x5, #0x1
1f0: 54ffff41 b.ne 1d8 <sdot_k_NEOVERSEN1+0x1d8> // b.any
1f4: 1e2043e0 fmov s0, s31
1f8: d65f03c0 ret
After upgrading to AlmaLinux 10, we noticed that a simple sanity check of the dot product gave a wrong result. The following small program demonstrates the problem:
[arnej@a10-1 work]$ cat tob.c
[arnej@a10-1 work]$ cc tob.c -I /usr/include/openblas -lopenblas
[arnej@a10-1 work]$ ./a.out
manual dot product = 346.000000; cblas_sdot = 10.000000
This happens because the vectorized part of dot_kernel_asimd accumulates its result into the v0.4s (aka s0) register, while the surrounding code refers to the result through the symbolic operand %s[DOT_] via the OUT macro. This only works when the compiler happens to pick s0 as the register for the "dot" variable.
Below is the assembly code I get by running:
$ cc -DCNAME=sdot_k_NEOVERSEN1 -c -O2 -g -march=armv8.2-a -mtune=neoverse-n1 -I. kernel/arm64/dot.c
$ objdump -d dot.o
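Notice how the instruction at offset 0x138 is a faddp with v0.4s as its destination, so the vectorized sum ends up in s0. Execution then reaches 0x1f4, either directly via the b.le at 0x140 or after the scalar tail loop at 0x144-0x15c and the branch at 0x160, where s0 is overwritten with the contents of s31 (the register the compiler chose here for the scalar accumulator), discarding the vectorized sum.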