Skip to content

wrong result from cblas_sdot on arm64 #5293

@arnej27959

Description

@arnej27959

After upgrading to Almalinux 10, we noticed that a simple sanity check of the dot product produced a wrong result. This small program reproduces the problem:

[arnej@a10-1 work]$ cat tob.c

#include <cblas.h>
#include <stdio.h>

/*
 * Reproducer for the wrong cblas_sdot result.
 *
 * Fills two 65-element float vectors with small integer values, computes
 * their dot product with a plain scalar loop, and prints it next to the
 * value returned by cblas_sdot for the same unit-stride inputs.  All
 * operands and partial sums are small integers, so the two printed values
 * are expected to be identical.
 *
 * Returns 0 unconditionally; the comparison is left to the reader of the
 * printed line.
 */
int main(int argc, char **argv) {
	float a[65];
	float b[65];
	/* The reference accumulator must start at zero: it is only ever
	 * updated with "+=", so leaving it uninitialized would make the
	 * "manual" value depend on whatever was in that stack slot. */
	float sum = 0.0f;
	int i;
	for (i = 0; i < 65; i++) {
		a[i] = (i % 3) + 1;	/* cycles 1, 2, 3 */
		b[i] = 4 - (i % 3);	/* cycles 4, 3, 2 */
		sum += (a[i] * b[i]);
	}
	/* n = 65 with both increments equal to 1. */
	float dotprod = cblas_sdot(65, a, 1, b, 1);
	printf("manual dot product = %f; cblas_sdot = %f\n", sum, dotprod);
	return 0;
}

[arnej@a10-1 work]$ cc tob.c -I /usr/include/openblas -lopenblas
[arnej@a10-1 work]$ ./a.out
manual dot product = 346.000000; cblas_sdot = 10.000000

This happens because the vectorized part of dot_kernel_asimd gathers its result into the v0.4s (aka s0) register, but the surrounding code uses the symbolic %s[DOT_] via the OUT macro. This only works when the compiler happens to pick s0 as the register for the "dot" variable.

Below is the assembly code I get by running:
$ cc -DCNAME=sdot_k_NEOVERSEN1 -c -O2 -g -march=armv8.2-a -mtune=neoverse-n1 -I. kernel/arm64/dot.c
$ objdump -d dot.o
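Notice how the faddp at 134 and 138 reduces the vector sum into v0.4s (i.e. s0). Execution then reaches 1f4 either directly via the b.le at 140 or after the scalar tail loop at 144-15c, which accumulates into s31 rather than s0. At 1f4, s0 is overwritten with the contents of s31, so the vectorized partial sum is lost.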

0000000000000000 <sdot_k_NEOVERSEN1>:
   0:	0f000400 	movi	v0.2s, #0x0
   4:	f100001f 	cmp	x0, #0x0
   8:	54000f8d 	b.le	1f8 <sdot_k_NEOVERSEN1+0x1f8>
   c:	d2800005 	mov	x5, #0x0                   	// #0
  10:	1e2703ff 	fmov	s31, wzr
  14:	9e6703e1 	fmov	d1, xzr
  18:	9e6703e2 	fmov	d2, xzr
  1c:	9e6703e3 	fmov	d3, xzr
  20:	9e6703e4 	fmov	d4, xzr
  24:	9e6703e5 	fmov	d5, xzr
  28:	9e6703e6 	fmov	d6, xzr
  2c:	9e6703e7 	fmov	d7, xzr
  30:	f100045f 	cmp	x2, #0x1
  34:	54000981 	b.ne	164 <sdot_k_NEOVERSEN1+0x164>  // b.any
  38:	f100049f 	cmp	x4, #0x1
  3c:	54000941 	b.ne	164 <sdot_k_NEOVERSEN1+0x164>  // b.any
  40:	d37ef442 	lsl	x2, x2, #2
  44:	d37ef484 	lsl	x4, x4, #2
  48:	9346fc05 	asr	x5, x0, #6
  4c:	eb1f00bf 	cmp	x5, xzr
  50:	54000760 	b.eq	13c <sdot_k_NEOVERSEN1+0x13c>  // b.none
  54:	d503201f 	nop
  58:	d503201f 	nop
  5c:	d503201f 	nop
  60:	ad404430 	ldp	q16, q17, [x1]
  64:	ad406478 	ldp	q24, q25, [x3]
  68:	ad414c32 	ldp	q18, q19, [x1, #32]
  6c:	ad416c7a 	ldp	q26, q27, [x3, #32]
  70:	4e38ce00 	fmla	v0.4s, v16.4s, v24.4s
  74:	4e39ce21 	fmla	v1.4s, v17.4s, v25.4s
  78:	ad425434 	ldp	q20, q21, [x1, #64]
  7c:	ad42747c 	ldp	q28, q29, [x3, #64]
  80:	4e3ace42 	fmla	v2.4s, v18.4s, v26.4s
  84:	4e3bce63 	fmla	v3.4s, v19.4s, v27.4s
  88:	ad435c36 	ldp	q22, q23, [x1, #96]
  8c:	ad437c7e 	ldp	q30, q31, [x3, #96]
  90:	91020063 	add	x3, x3, #0x80
  94:	91020021 	add	x1, x1, #0x80
  98:	4e3cce84 	fmla	v4.4s, v20.4s, v28.4s
  9c:	4e3dcea5 	fmla	v5.4s, v21.4s, v29.4s
  a0:	f981c020 	prfm	pldl1keep, [x1, #896]
  a4:	f981c060 	prfm	pldl1keep, [x3, #896]
  a8:	f981e020 	prfm	pldl1keep, [x1, #960]
  ac:	f981e060 	prfm	pldl1keep, [x3, #960]
  b0:	4e3ecec6 	fmla	v6.4s, v22.4s, v30.4s
  b4:	4e3fcee7 	fmla	v7.4s, v23.4s, v31.4s
  b8:	ad404430 	ldp	q16, q17, [x1]
  bc:	ad406478 	ldp	q24, q25, [x3]
  c0:	ad414c32 	ldp	q18, q19, [x1, #32]
  c4:	ad416c7a 	ldp	q26, q27, [x3, #32]
  c8:	4e38ce00 	fmla	v0.4s, v16.4s, v24.4s
  cc:	4e39ce21 	fmla	v1.4s, v17.4s, v25.4s
  d0:	ad425434 	ldp	q20, q21, [x1, #64]
  d4:	ad42747c 	ldp	q28, q29, [x3, #64]
  d8:	4e3ace42 	fmla	v2.4s, v18.4s, v26.4s
  dc:	4e3bce63 	fmla	v3.4s, v19.4s, v27.4s
  e0:	ad435c36 	ldp	q22, q23, [x1, #96]
  e4:	ad437c7e 	ldp	q30, q31, [x3, #96]
  e8:	91020063 	add	x3, x3, #0x80
  ec:	91020021 	add	x1, x1, #0x80
  f0:	4e3cce84 	fmla	v4.4s, v20.4s, v28.4s
  f4:	4e3dcea5 	fmla	v5.4s, v21.4s, v29.4s
  f8:	f981c020 	prfm	pldl1keep, [x1, #896]
  fc:	f981c060 	prfm	pldl1keep, [x3, #896]
 100:	f981e020 	prfm	pldl1keep, [x1, #960]
 104:	f981e060 	prfm	pldl1keep, [x3, #960]
 108:	4e3ecec6 	fmla	v6.4s, v22.4s, v30.4s
 10c:	4e3fcee7 	fmla	v7.4s, v23.4s, v31.4s
 110:	f10004a5 	subs	x5, x5, #0x1
 114:	54fffa61 	b.ne	60 <sdot_k_NEOVERSEN1+0x60>  // b.any
 118:	4e21d400 	fadd	v0.4s, v0.4s, v1.4s
 11c:	4e23d442 	fadd	v2.4s, v2.4s, v3.4s
 120:	4e25d484 	fadd	v4.4s, v4.4s, v5.4s
 124:	4e27d4c6 	fadd	v6.4s, v6.4s, v7.4s
 128:	4e22d400 	fadd	v0.4s, v0.4s, v2.4s
 12c:	4e26d484 	fadd	v4.4s, v4.4s, v6.4s
 130:	4e24d400 	fadd	v0.4s, v0.4s, v4.4s
 134:	6e20d400 	faddp	v0.4s, v0.4s, v0.4s
 138:	6e20d400 	faddp	v0.4s, v0.4s, v0.4s
 13c:	f2401405 	ands	x5, x0, #0x3f
 140:	540005ad 	b.le	1f4 <sdot_k_NEOVERSEN1+0x1f4>
 144:	bd400030 	ldr	s16, [x1]
 148:	bd400078 	ldr	s24, [x3]
 14c:	8b020021 	add	x1, x1, x2
 150:	8b040063 	add	x3, x3, x4
 154:	1f187e1f 	fmadd	s31, s16, s24, s31
 158:	f10004a5 	subs	x5, x5, #0x1
 15c:	54ffff41 	b.ne	144 <sdot_k_NEOVERSEN1+0x144>  // b.any
 160:	14000025 	b	1f4 <sdot_k_NEOVERSEN1+0x1f4>
 164:	d37ef442 	lsl	x2, x2, #2
 168:	d37ef484 	lsl	x4, x4, #2
 16c:	9342fc05 	asr	x5, x0, #2
 170:	eb1f00bf 	cmp	x5, xzr
 174:	540002ed 	b.le	1d0 <sdot_k_NEOVERSEN1+0x1d0>
 178:	bd400030 	ldr	s16, [x1]
 17c:	bd400078 	ldr	s24, [x3]
 180:	8b020021 	add	x1, x1, x2
 184:	8b040063 	add	x3, x3, x4
 188:	1f187e1f 	fmadd	s31, s16, s24, s31
 18c:	bd400030 	ldr	s16, [x1]
 190:	bd400078 	ldr	s24, [x3]
 194:	8b020021 	add	x1, x1, x2
 198:	8b040063 	add	x3, x3, x4
 19c:	1f187e1f 	fmadd	s31, s16, s24, s31
 1a0:	bd400030 	ldr	s16, [x1]
 1a4:	bd400078 	ldr	s24, [x3]
 1a8:	8b020021 	add	x1, x1, x2
 1ac:	8b040063 	add	x3, x3, x4
 1b0:	1f187e1f 	fmadd	s31, s16, s24, s31
 1b4:	bd400030 	ldr	s16, [x1]
 1b8:	bd400078 	ldr	s24, [x3]
 1bc:	8b020021 	add	x1, x1, x2
 1c0:	8b040063 	add	x3, x3, x4
 1c4:	1f187e1f 	fmadd	s31, s16, s24, s31
 1c8:	f10004a5 	subs	x5, x5, #0x1
 1cc:	54fffd61 	b.ne	178 <sdot_k_NEOVERSEN1+0x178>  // b.any
 1d0:	f2400405 	ands	x5, x0, #0x3
 1d4:	5400010d 	b.le	1f4 <sdot_k_NEOVERSEN1+0x1f4>
 1d8:	bd400030 	ldr	s16, [x1]
 1dc:	bd400078 	ldr	s24, [x3]
 1e0:	8b020021 	add	x1, x1, x2
 1e4:	8b040063 	add	x3, x3, x4
 1e8:	1f187e1f 	fmadd	s31, s16, s24, s31
 1ec:	f10004a5 	subs	x5, x5, #0x1
 1f0:	54ffff41 	b.ne	1d8 <sdot_k_NEOVERSEN1+0x1d8>  // b.any
 1f4:	1e2043e0 	fmov	s0, s31
 1f8:	d65f03c0 	ret

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions