-
Notifications
You must be signed in to change notification settings - Fork 1.6k
Open
Description
Extracting from rust-ndarray/ndarray#1521.
The following reproduction fails on Mac M4 (version 0.3.30) but passes on aarch64 Linux (version 0.3.21):
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
// Include BLAS headers - OpenBLAS or Accelerate
#ifdef __APPLE__
#include <Accelerate/Accelerate.h>
#else
#include <cblas.h>
#endif
int main() {
// Test case from ndarray test_mat_mul
// A is 45x33, B is 33x33 identity, C should be 45x33 (same as A)
const int m = 45, n = 33, k = 33;
const float alpha = 1.0f, beta = 0.0f;
// Allocate matrices
float *A = (float*)malloc(m * k * sizeof(float));
float *B = (float*)malloc(k * n * sizeof(float));
float *C = (float*)malloc(m * n * sizeof(float));
// Initialize A with linspace(0, 1484, 45*33) like ndarray test
for (int i = 0; i < m * k; i++) {
A[i] = (float)i;
}
// Initialize B as identity matrix
memset(B, 0, k * n * sizeof(float));
for (int i = 0; i < k; i++) {
B[i * n + i] = 1.0f; // B[i][i] = 1
}
// Initialize C to zero
memset(C, 0, m * n * sizeof(float));
printf("Before SGEMM:\n");
printf("A[0..2,0..2]:\n");
for (int i = 0; i < 3; i++) {
for (int j = 0; j < 3; j++) {
printf("%.2f ", A[i * k + j]);
}
printf("\n");
}
printf("B[0..2,0..2]:\n");
for (int i = 0; i < 3; i++) {
for (int j = 0; j < 3; j++) {
printf("%.2f ", B[i * n + j]);
}
printf("\n");
}
// Call OpenBLAS SGEMM
// C = alpha * A * B + beta * C
// Row-major layout: CblasRowMajor, no transpose
cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
m, n, k,
alpha,
A, k, // lda = k (leading dimension of A)
B, n, // ldb = n (leading dimension of B)
beta,
C, n); // ldc = n (leading dimension of C)
printf("\nAfter SGEMM:\n");
printf("C[0..2,0..2] (should equal A[0..2,0..2]):\n");
for (int i = 0; i < 3; i++) {
for (int j = 0; j < 3; j++) {
printf("%.2f ", C[i * n + j]);
}
printf("\n");
}
// Check if result is correct (C should equal A since A * I = A)
int correct = 1;
for (int i = 0; i < m * n; i++) {
if (fabs(C[i] - A[i]) > 1e-5) {
correct = 0;
break;
}
}
printf("\nResult: %s\n", correct ? "CORRECT" : "INCORRECT");
// Print some diagnostics
printf("\nDiagnostics:\n");
printf("All C values zero? %s\n", C[0] == 0.0f && C[1] == 0.0f ? "YES" : "NO");
free(A);
free(B);
free(C);
return correct ? 0 : 1;
}Compile with gcc -I/opt/homebrew/opt/openblas/include -L/opt/homebrew/opt/openblas/lib -lopenblas debug_openblas.c -o debug_openblas -Wno-deprecated
On Mac with OpenBlas, this returns:
After SGEMM:
C[0..2,0..2] (should equal A[0..2,0..2]):
0.00 0.00 0.00
0.00 0.00 0.00
0.00 0.00 0.00
On Linux with OpenBlas or on Mac with Accelerate, this returns:
After SGEMM:
C[0..2,0..2] (should equal A[0..2,0..2]):
0.00 1.00 2.00
33.00 34.00 35.00
66.00 67.00 68.00
Metadata
Metadata
Assignees
Labels
No labels