Commit 54479d5

Fix sparse tensor gradients and add backend checks
- Preserve PyTorch sparse tensors through the numpy conversion so autograd still works
- Verify that the gradient w.r.t. M equals the transport plan
- Add sparse backend compatibility checks and tests; raise an error when an unsupported backend is used with sparse inputs
1 parent 1a3dc41 commit 54479d5
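
In practice, the change means ot.emd2 can take a torch.sparse_coo_tensor cost matrix and backpropagate through it. A minimal usage sketch, mirroring the new tests below (assumes PyTorch is installed; the stored entries of M are the usable edges, chosen here so the problem stays feasible):

import torch
import ot

n = 4
a = torch.tensor(ot.utils.unif(n), dtype=torch.float64, requires_grad=True)
b = torch.tensor(ot.utils.unif(n), dtype=torch.float64, requires_grad=True)

# Sparse cost matrix: the diagonal plus the first upper off-diagonal
rows = torch.tensor([0, 1, 2, 3, 0, 1, 2], dtype=torch.int64)
cols = torch.tensor([0, 1, 2, 3, 1, 2, 3], dtype=torch.int64)
vals = torch.tensor([0.1, 0.1, 0.1, 0.1, 1.0, 1.0, 1.0], dtype=torch.float64)
M = torch.sparse_coo_tensor(
    torch.stack([rows, cols]), vals, (n, n), dtype=torch.float64, requires_grad=True
)

cost = ot.emd2(a, b, M)  # EMD value as a differentiable torch scalar
cost.backward()          # per this commit, M.grad is sparse and equals the optimal plan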

File tree: 4 files changed (+185, −15 lines)

ot/backend.py

Lines changed: 10 additions & 1 deletion
@@ -178,7 +178,16 @@ def _get_backend_instance(backend_impl):
 
 
 def _check_args_backend(backend_impl, args):
-    is_instance = set(isinstance(arg, backend_impl.__type__) for arg in args)
+    # Get backend instance to use issparse method
+    backend = _get_backend_instance(backend_impl)
+
+    # Check if each arg is either:
+    # 1. An instance of backend.__type__ (e.g., np.ndarray for NumPy)
+    # 2. A sparse matrix recognized by backend.issparse() (e.g., scipy.sparse for NumPy)
+    is_instance = set(
+        isinstance(arg, backend_impl.__type__) or backend.issparse(arg) for arg in args
+    )
+
     # check that all arguments matched or not the type
     if len(is_instance) == 1:
         return is_instance.pop()
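
The check above is deliberately all-or-nothing: it collects one boolean per argument and only accepts the call when the set collapses to a single value, so a mix of arrays from different backends falls through and is rejected further up. A standalone sketch of that pattern for the NumPy case (illustrative helper names, not the library's actual code):

import numpy as np
from scipy.sparse import issparse


def _is_numpy_like(arg):
    # Hypothetical helper: NumPy arrays and scipy.sparse matrices both count
    return isinstance(arg, np.ndarray) or issparse(arg)


def _matches_numpy_backend(args):
    # Mirrors the set-based test in _check_args_backend: one flag per argument;
    # a single True means "all match", a single False means "none match",
    # and a mixed set means the inputs are inconsistent.
    flags = set(_is_numpy_like(arg) for arg in args)
    if len(flags) == 1:
        return flags.pop()
    raise ValueError("All array arguments must come from the same backend")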

ot/lp/_network_simplex.py

Lines changed: 37 additions & 14 deletions
@@ -10,7 +10,6 @@
 
 import numpy as np
 import warnings
-from scipy.sparse import issparse as scipy_issparse
 
 from ..utils import list_to_array, check_number_threads
 from ..backend import get_backend
@@ -295,12 +294,14 @@ def emd(
     edge_costs = None
     n1, n2 = None, None
 
-    # Get backend to check if M is sparse
-    a, b = list_to_array(a, b)
-    nx = get_backend(a, b)
+    # Get backend from M first, then use it for list_to_array
+    # This ensures empty lists [] are converted to arrays in the correct backend
+    nx_M = get_backend(M)
+    a, b = list_to_array(a, b, nx=nx_M)
+    nx = get_backend(a, b, M)
 
-    # Check if M is sparse (either backend sparse or scipy.sparse)
-    is_sparse = nx.issparse(M) or scipy_issparse(M)
+    # Check if M is sparse using backend's issparse method
+    is_sparse = nx.issparse(M)
 
     if is_sparse:
         # Check if backend supports sparse matrices
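
The reordering in this hunk matters mostly for the empty-list shorthand: ot.emd accepts a = b = [] to request uniform weights, and resolving the backend from M first means those weights are materialized in M's backend rather than defaulting to NumPy. A minimal sketch of the call it is meant to support (assumes PyTorch and this commit's sparse support):

import torch
import ot

rows = torch.tensor([0, 0, 1, 1], dtype=torch.int64)
cols = torch.tensor([0, 1, 0, 1], dtype=torch.int64)
vals = torch.tensor([0.1, 1.0, 1.0, 0.1], dtype=torch.float64)
M = torch.sparse_coo_tensor(torch.stack([rows, cols]), vals, (2, 2))

# Empty lists mean "uniform weights"; with backend resolution from M first,
# they are created as torch tensors so get_backend(a, b, M) sees one backend.
G = ot.emd([], [], M)  # sparse transport plan in the torch backend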
@@ -579,14 +580,17 @@ def emd2(
     edge_costs = None
     n1, n2 = None, None
 
-    # Get backend to check if M is sparse
-    a, b = list_to_array(a, b)
-    nx = get_backend(a, b)
+    # Get backend from M first, then use it for list_to_array
+    # This ensures empty lists [] are converted to arrays in the correct backend
+    nx_M = get_backend(M)
+    a, b = list_to_array(a, b, nx=nx_M)
+    nx = get_backend(a, b, M)
 
-    # Check if M is sparse (either backend sparse or scipy.sparse)
-    from scipy.sparse import issparse as scipy_issparse
+    # Check if M is sparse using backend's issparse method
+    is_sparse = nx.issparse(M)
 
-    is_sparse = nx.issparse(M) or scipy_issparse(M)
+    # Save original sparse tensor for gradient tracking (before conversion to numpy)
+    M_original_sparse = None
 
     if is_sparse:
         # Check if backend supports sparse matrices

@@ -599,6 +603,9 @@
                 "matrix to dense format using M.toarray() or equivalent before calling emd2()."
             )
 
+        # Save original M for gradient tracking (before numpy conversion)
+        M_original_sparse = M
+
         # Extract COO data using backend method - returns numpy arrays
         edge_sources, edge_targets, edge_costs, (n1, n2) = nx.sparse_coo_data(M)
 
@@ -641,7 +648,9 @@
     M0 = None if is_sparse else M
 
     if is_sparse:
-        edge_costs_original = nx.from_numpy(edge_costs, type_as=type_as)
+        # Use the original sparse tensor (preserves gradients for PyTorch)
+        # instead of converting from numpy
+        edge_costs_original = M_original_sparse
     else:
         edge_costs_original = None
 
@@ -713,13 +722,27 @@ def f(b):
                if edge_idx >= 0:
                    grad_edge_costs[edge_idx] = flow
 
+            # Convert gradient to sparse format matching edge_costs_original
+            grad_edge_costs_backend = nx.from_numpy(grad_edge_costs, type_as=type_as)
+            if nx.issparse(edge_costs_original):
+                # Reconstruct sparse gradient tensor with same structure as original
+                grad_M_sparse = nx.coo_matrix(
+                    grad_edge_costs_backend,
+                    nx.from_numpy(edge_sources.astype(np.int64), type_as=type_as),
+                    nx.from_numpy(edge_targets.astype(np.int64), type_as=type_as),
+                    shape=(n1, n2),
+                    type_as=type_as,
+                )
+            else:
+                grad_M_sparse = grad_edge_costs_backend
+
            cost = nx.set_gradients(
                nx.from_numpy(cost, type_as=type_as),
                (a0, b0, edge_costs_original),
                (
                    nx.from_numpy(u - np.mean(u), type_as=type_as),
                    nx.from_numpy(v - np.mean(v), type_as=type_as),
-                    nx.from_numpy(grad_edge_costs, type_as=type_as),
+                    grad_M_sparse,
                ),
            )
        else:
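
Two things are going on in the emd2 changes. First, edge_costs_original must be the caller's own sparse tensor: rebuilding it with nx.from_numpy would round-trip through NumPy and detach it from PyTorch's autograd graph, leaving set_gradients nothing to attach to. A tiny illustration of that pitfall (generic PyTorch, not POT code):

import torch

x = torch.tensor([1.0, 2.0], requires_grad=True)
y = torch.as_tensor(x.detach().numpy())  # round-tripping through NumPy drops the graph
print(y.requires_grad)                   # False: nothing can flow back to x through y

Second, the gradients that set_gradients attaches follow the usual sensitivities of the EMD linear program, which the new code simply reassembles in sparse COO form on the stored edges: with W = emd2(a, b, M), optimal plan gamma* and dual potentials (u, v),

    dW/dM_ij = gamma*_ij,    dW/da_i = u_i - mean(u),    dW/db_j = v_j - mean(v),

which is what the mean-centred u and v and the reconstructed grad_M_sparse express, and what test_emd2_sparse_vs_dense_gradients verifies below.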

test/test_backend.py

Lines changed: 42 additions & 0 deletions
@@ -75,6 +75,48 @@ class nx_subclass(nx.__type__):
     assert effective_nx.__name__ == nx.__name__
 
 
+def test_get_backend_sparse_matrix():
+    """Test that get_backend correctly handles sparse matrices and rejects mixed backends."""
+    from scipy.sparse import coo_matrix
+
+    a_np = np.array([0.5, 0.5])
+    b_np = np.array([0.5, 0.5])
+    M_scipy = coo_matrix(([1.0, 2.0], ([0, 1], [0, 1])), shape=(2, 2))
+
+    nx = get_backend(a_np, b_np, M_scipy)
+    assert nx.__name__ == "numpy", "NumPy backend should accept scipy.sparse matrices"
+
+    nx = get_backend(M_scipy)
+    assert nx.__name__ == "numpy", "scipy.sparse should use NumPy backend"
+
+    if torch:
+        a_torch = torch.tensor([0.5, 0.5])
+        b_torch = torch.tensor([0.5, 0.5])
+        M_torch_sparse = torch.sparse_coo_tensor(
+            torch.tensor([[0, 1], [0, 1]]), torch.tensor([1.0, 2.0]), (2, 2)
+        )
+
+        nx = get_backend(a_torch, b_torch, M_torch_sparse)
+        assert (
+            nx.__name__ == "torch"
+        ), "PyTorch backend should accept torch.sparse tensors"
+
+        nx = get_backend(M_torch_sparse)
+        assert nx.__name__ == "torch", "torch.sparse should use PyTorch backend"
+
+        # Case 1: PyTorch dense + scipy.sparse (incompatible)
+        with pytest.raises(ValueError):
+            get_backend(a_torch, b_torch, M_scipy)
+
+        # Case 2: NumPy dense + torch.sparse (incompatible)
+        with pytest.raises(ValueError):
+            get_backend(a_np, b_np, M_torch_sparse)
+
+        # Case 3: scipy.sparse + torch.sparse (incompatible)
+        with pytest.raises(ValueError):
+            get_backend(M_scipy, M_torch_sparse)
+
+
 def test_convert_between_backends(nx):
     A = np.zeros((3, 2))
     B = np.zeros((3, 1))

test/test_ot.py

Lines changed: 96 additions & 0 deletions
@@ -1083,6 +1083,102 @@ def test_emd2_sparse_vs_dense():
     np.testing.assert_allclose(cost_dense, cost_sparse, rtol=1e-5, atol=1e-7)
 
 
+def test_emd2_sparse_gradients():
+    """Test that PyTorch sparse tensors support gradient computation."""
+    if not torch:
+        pytest.skip("PyTorch not available")
+
+    n = 10
+    a = torch.tensor(ot.utils.unif(n), requires_grad=True, dtype=torch.float64)
+    b = torch.tensor(ot.utils.unif(n), requires_grad=True, dtype=torch.float64)
+
+    rows, cols, costs = [], [], []
+    for i in range(n):
+        rows.append(i)
+        cols.append(i)
+        costs.append(0.1)
+        for offset in [1, 2]:
+            j = (i + offset) % n
+            rows.append(i)
+            cols.append(j)
+            costs.append(float(offset))
+
+    indices = torch.tensor(
+        np.vstack([np.array(rows), np.array(cols)]), dtype=torch.int64
+    )
+    values = torch.tensor(costs, dtype=torch.float64)
+    M_sparse = torch.sparse_coo_tensor(indices, values, (n, n), dtype=torch.float64)
+
+    cost = ot.emd2(a, b, M_sparse)
+    cost.backward()
+
+    assert a.grad is not None
+    assert b.grad is not None
+    np.testing.assert_allclose(
+        a.grad.sum().item(), -b.grad.sum().item(), rtol=1e-5, atol=1e-7
+    )
+
+
+def test_emd2_sparse_vs_dense_gradients():
+    """Verify gradient w.r.t. cost matrix M equals transport plan G."""
+    if not torch:
+        pytest.skip("PyTorch not available")
+
+    n = 4
+    a = torch.tensor([0.25, 0.25, 0.25, 0.25], requires_grad=True, dtype=torch.float64)
+    b = torch.tensor([0.25, 0.25, 0.25, 0.25], requires_grad=True, dtype=torch.float64)
+
+    M_full = torch.tensor(
+        [
+            [0.1, 1.0, 2.0, 3.0],
+            [1.0, 0.1, 1.0, 2.0],
+            [2.0, 1.0, 0.1, 1.0],
+            [3.0, 2.0, 1.0, 0.1],
+        ],
+        dtype=torch.float64,
+        requires_grad=True,
+    )
+
+    cost_dense = ot.emd2(a, b, M_full)
+    cost_dense.backward()
+    G_dense = ot.emd(a.detach(), b.detach(), M_full.detach())
+
+    np.testing.assert_allclose(
+        M_full.grad.numpy(), G_dense.numpy(), rtol=1e-7, atol=1e-10
+    )
+
+    a.grad = None
+    b.grad = None
+
+    rows, cols, costs = [], [], []
+    for i in range(n):
+        for j in range(max(0, i - 1), min(n, i + 2)):
+            rows.append(i)
+            cols.append(j)
+            costs.append(M_full[i, j].item())
+
+    rows_t = torch.tensor(rows, dtype=torch.int64)
+    cols_t = torch.tensor(cols, dtype=torch.int64)
+    M_sparse = torch.sparse_coo_tensor(
+        torch.stack([rows_t, cols_t]),
+        torch.tensor(costs, dtype=torch.float64),
+        (n, n),
+        dtype=torch.float64,
+        requires_grad=True,
+    )
+
+    cost_sparse = ot.emd2(a, b, M_sparse)
+    cost_sparse.backward()
+    G_sparse = ot.emd(a.detach(), b.detach(), M_sparse.detach()).to_dense()
+
+    grad_values = M_sparse.grad.coalesce().values().numpy()
+    G_values = G_sparse[rows_t, cols_t].numpy()
+
+    np.testing.assert_allclose(grad_values, G_values, rtol=1e-7, atol=1e-10)
+    assert grad_values.sum() > 0
+    assert np.abs(grad_values.sum() - 1.0) < 1e-7
+
+
 def test_emd_sparse_backends(nx):
     """Test that sparse EMD works with different backends for weights a and b.
 