From 7791cee40bf1bed80486fc7747dc14fb47dbc5a1 Mon Sep 17 00:00:00 2001
From: clvincen <cedvincentcuaz@gmail.com>
Date: Fri, 4 Aug 2023 14:28:44 +0200
Subject: [PATCH 1/4] fix gromov doc

---
 ot/gromov/_bregman.py     |  9 ++--
 ot/gromov/_estimators.py  |  5 ++-
 ot/gromov/_gw.py          | 89 ++++++++++++++++++++++++---------------
 ot/gromov/_semirelaxed.py | 24 ++++++-----
 4 files changed, 77 insertions(+), 50 deletions(-)

diff --git a/ot/gromov/_bregman.py b/ot/gromov/_bregman.py
index 18cef568b..10a0d65c2 100644
--- a/ot/gromov/_bregman.py
+++ b/ot/gromov/_bregman.py
@@ -225,8 +225,9 @@ def entropic_gromov_wasserstein2(
         C1, C2, p=None, q=None, loss_fun='square_loss', epsilon=0.1, symmetric=None, G0=None, max_iter=1000,
         tol=1e-9, solver='PGD', warmstart=False, verbose=False, log=False, **kwargs):
     r"""
-    Returns the Gromov-Wasserstein discrepancy between :math:`(\mathbf{C_1}, \mathbf{p})` and :math:`(\mathbf{C_2}, \mathbf{q})`
+    Returns the Gromov-Wasserstein discrepancy :math:`\mathbf{GW}` between :math:`(\mathbf{C_1}, \mathbf{p})` and :math:`(\mathbf{C_2}, \mathbf{q})`
     estimated using Sinkhorn projections.
+    The Gromov-Wasserstein distance as defined in [13] satisfies :math:`d_{GW} = \frac{1}{2} \sqrt{\mathbf{GW}}`.
 
     If `solver="PGD"`, the function solves the following entropic-regularized
     Gromov-Wasserstein optimization problem using Projected Gradient Descent [12]:
@@ -351,7 +352,7 @@ def entropic_gromov_barycenters(
 
     .. math::
 
-        \mathbf{C} = \mathop{\arg \min}_{\mathbf{C}\in \mathbb{R}^{N \times N}} \quad \sum_s \lambda_s \mathrm{GW}(\mathbf{C}, \mathbf{C}_s, \mathbf{p}, \mathbf{p}_s)
+        \mathbf{C}^* = \mathop{\arg \min}_{\mathbf{C}\in \mathbb{R}^{N \times N}} \quad \sum_s \lambda_s \mathrm{GW}(\mathbf{C}, \mathbf{C}_s, \mathbf{p}, \mathbf{p}_s)
 
     Where :
 
@@ -700,7 +701,7 @@ def entropic_fused_gromov_wasserstein2(
         symmetric=None, alpha=0.5, G0=None, max_iter=1000, tol=1e-9,
         solver='PGD', warmstart=False, verbose=False, log=False, **kwargs):
     r"""
-    Returns the Fused Gromov-Wasserstein transport between :math:`(\mathbf{C_1}, \mathbf{Y_1}, \mathbf{p})` and :math:`(\mathbf{C_2}, \mathbf{Y_2}, \mathbf{q})`
+    Returns the Fused Gromov-Wasserstein distance between :math:`(\mathbf{C_1}, \mathbf{Y_1}, \mathbf{p})` and :math:`(\mathbf{C_2}, \mathbf{Y_2}, \mathbf{q})`
     with pairwise distance matrix :math:`\mathbf{M}` between node feature matrices :math:`\mathbf{Y_1}` and :math:`\mathbf{Y_2}`,
     estimated using Sinkhorn projections.
 
@@ -832,7 +833,7 @@ def entropic_fused_gromov_barycenters(
 
     .. math::
 
-        \mathbf{C}, \mathbf{Y} = \mathop{\arg \min}_{\mathbf{C}\in \mathbb{R}^{N \times N}, \mathbf{Y}\in \mathbb{Y}^{N \times d}} \quad \sum_s \lambda_s \mathrm{FGW}_{\alpha}(\mathbf{C}, \mathbf{C}_s, \mathbf{Y}, \mathbf{Y}_s, \mathbf{p}, \mathbf{p}_s)
+        \mathbf{C}^*, \mathbf{Y}^* = \mathop{\arg \min}_{\mathbf{C}\in \mathbb{R}^{N \times N}, \mathbf{Y}\in \mathbb{R}^{N \times d}} \quad \sum_s \lambda_s \mathrm{FGW}_{\alpha}(\mathbf{C}, \mathbf{C}_s, \mathbf{Y}, \mathbf{Y}_s, \mathbf{p}, \mathbf{p}_s)
 
     Where :
 
diff --git a/ot/gromov/_estimators.py b/ot/gromov/_estimators.py
index 0a29a918b..9407ecf64 100644
--- a/ot/gromov/_estimators.py
+++ b/ot/gromov/_estimators.py
@@ -20,14 +20,15 @@
 def GW_distance_estimation(C1, C2, p, q, loss_fun, T,
                            nb_samples_p=None, nb_samples_q=None, std=True, random_state=None):
     r"""
-    Returns an approximation of the gromov-wasserstein cost between :math:`(\mathbf{C_1}, \mathbf{p})` and :math:`(\mathbf{C_2}, \mathbf{q})`
+    Returns an approximation of the Gromov-Wasserstein discrepancy between :math:`(\mathbf{C_1}, \mathbf{p})` and :math:`(\mathbf{C_2}, \mathbf{q})`
     with a fixed transport plan :math:`\mathbf{T}`.
+    An approximation of the Gromov-Wasserstein distance as defined in [13] satisfies :math:`d_{GW} = \frac{1}{2} \sqrt{\mathbf{GW}}`.
 
     The function gives an unbiased approximation of the following equation:
 
     .. math::
 
-        GW = \sum_{i,j,k,l} L(\mathbf{C_{1}}_{i,k}, \mathbf{C_{2}}_{j,l}) \mathbf{T}_{i,j} \mathbf{T}_{k,l}
+        \mathbf{GW} = \sum_{i,j,k,l} L(\mathbf{C_{1}}_{i,k}, \mathbf{C_{2}}_{j,l}) \mathbf{T}_{i,j} \mathbf{T}_{k,l}
 
     Where :
 
diff --git a/ot/gromov/_gw.py b/ot/gromov/_gw.py
index adf6b82b1..5b935e34f 100644
--- a/ot/gromov/_gw.py
+++ b/ot/gromov/_gw.py
@@ -26,9 +26,9 @@
 def gromov_wasserstein(C1, C2, p=None, q=None, loss_fun='square_loss', symmetric=None, log=False, armijo=False, G0=None,
                        max_iter=1e4, tol_rel=1e-9, tol_abs=1e-9, **kwargs):
     r"""
-    Returns the Gromov-Wasserstein transport between :math:`(\mathbf{C_1}, \mathbf{p})` and :math:`(\mathbf{C_2}, \mathbf{q})`
+    Returns the Gromov-Wasserstein transport between :math:`(\mathbf{C_1}, \mathbf{p})` and :math:`(\mathbf{C_2}, \mathbf{q})`.
 
-    The function solves the following optimization problem:
+    The function solves the following optimization problem using Conditional Gradient:
 
     .. math::
         \mathbf{T}^* \in \mathop{\arg \min}_\mathbf{T} \quad \sum_{i,j,k,l}
@@ -182,9 +182,10 @@ def line_search(cost, G, deltaG, Mi, cost_G, **kwargs):
 def gromov_wasserstein2(C1, C2, p=None, q=None, loss_fun='square_loss', symmetric=None, log=False, armijo=False, G0=None,
                         max_iter=1e4, tol_rel=1e-9, tol_abs=1e-9, **kwargs):
     r"""
-    Returns the Gromov-Wasserstein discrepancy between :math:`(\mathbf{C_1}, \mathbf{p})` and :math:`(\mathbf{C_2}, \mathbf{q})`
+    Returns the Gromov-Wasserstein discrepancy :math:`\mathbf{GW}` between :math:`(\mathbf{C_1}, \mathbf{p})` and :math:`(\mathbf{C_2}, \mathbf{q})`.
+    The Gromov-Wasserstein distance as defined in [13] satisfies :math:`d_{GW} = \frac{1}{2} \sqrt{\mathbf{GW}}`.
 
-    The function solves the following optimization problem:
+    The function solves the following optimization problem using Conditional Gradient:
 
     .. math::
         \mathbf{GW} = \min_\mathbf{T} \quad \sum_{i,j,k,l}
@@ -308,10 +309,13 @@ def gromov_wasserstein2(C1, C2, p=None, q=None, loss_fun='square_loss', symmetri
 def fused_gromov_wasserstein(M, C1, C2, p=None, q=None, loss_fun='square_loss', symmetric=None, alpha=0.5,
                              armijo=False, G0=None, log=False, max_iter=1e4, tol_rel=1e-9, tol_abs=1e-9, **kwargs):
     r"""
-    Computes the FGW transport between two graphs (see :ref:`[24] <references-fused-gromov-wasserstein>`)
+    Returns the Fused Gromov-Wasserstein transport between :math:`(\mathbf{C_1}, \mathbf{Y_1}, \mathbf{p})` and :math:`(\mathbf{C_2}, \mathbf{Y_2}, \mathbf{q})`
+    with pairwise distance matrix :math:`\mathbf{M}` between node feature matrices :math:`\mathbf{Y_1}` and :math:`\mathbf{Y_2}` (see :ref:`[24] <references-fused-gromov-wasserstein>`).
+
+    The function solves the following optimization problem using Conditional Gradient:
 
     .. math::
-        \mathbf{T}^* \in \mathop{\arg \min}_\mathbf{T}  \quad (1 - \alpha) \langle \mathbf{T}, \mathbf{M} \rangle_F +
+        \mathbf{T}^* \in\mathop{\arg\min}_\mathbf{T} \quad (1 - \alpha) \langle \mathbf{T}, \mathbf{M} \rangle_F +
         \alpha \sum_{i,j,k,l} L(\mathbf{C_1}_{i,k}, \mathbf{C_2}_{j,l}) \mathbf{T}_{i,j} \mathbf{T}_{k,l}
 
         s.t. \ \mathbf{T} \mathbf{1} &= \mathbf{p}
@@ -319,20 +323,23 @@ def fused_gromov_wasserstein(M, C1, C2, p=None, q=None, loss_fun='square_loss',
              \mathbf{T}^T \mathbf{1} &= \mathbf{q}
 
              \mathbf{T} &\geq 0
+
+    Where :
 
-    where :
-
-    - :math:`\mathbf{M}` is the (`ns`, `nt`) metric cost matrix
-    - :math:`\mathbf{p}` and :math:`\mathbf{q}` are source and target weights (sum to 1)
-    - `L` is a loss function to account for the misfit between the similarity matrices
-
+    - :math:`\mathbf{M}`: metric cost matrix between features across domains
+    - :math:`\mathbf{C_1}`: Metric cost matrix in the source space
+    - :math:`\mathbf{C_2}`: Metric cost matrix in the target space
+    - :math:`\mathbf{p}`: distribution in the source space
+    - :math:`\mathbf{q}`: distribution in the target space
+    - `L`: loss function to account for the misfit between the similarity and feature matrices
+    - :math:`\alpha`: trade-off parameter
+    
     .. note:: This function is backend-compatible and will work on arrays
         from all compatible backends. But the algorithm uses the C++ CPU backend
         which can lead to copy overhead on GPU arrays.
     .. note:: All computations in the conjugate gradient solver are done with
         numpy to limit memory overhead.
 
-    The algorithm used for solving the problem is conditional gradient as discussed in :ref:`[24] <references-fused-gromov-wasserstein>`
 
     Parameters
     ----------
@@ -465,26 +471,32 @@ def line_search(cost, G, deltaG, Mi, cost_G, **kwargs):
 def fused_gromov_wasserstein2(M, C1, C2, p=None, q=None, loss_fun='square_loss', symmetric=None, alpha=0.5,
                               armijo=False, G0=None, log=False, max_iter=1e4, tol_rel=1e-9, tol_abs=1e-9, **kwargs):
     r"""
-    Computes the FGW distance between two graphs see (see :ref:`[24] <references-fused-gromov-wasserstein2>`)
+    Returns the Fused Gromov-Wasserstein distance between :math:`(\mathbf{C_1}, \mathbf{Y_1}, \mathbf{p})` and :math:`(\mathbf{C_2}, \mathbf{Y_2}, \mathbf{q})`
+    with pairwise distance matrix :math:`\mathbf{M}` between node feature matrices :math:`\mathbf{Y_1}` and :math:`\mathbf{Y_2}` (see :ref:`[24] <references-fused-gromov-wasserstein2>`).
 
-    .. math::
-        \mathbf{GW} = \min_\mathbf{T} \quad (1 - \alpha) \langle \mathbf(T), \mathbf{M} \rangle_F + \alpha \sum_{i,j,k,l}
-        L(\mathbf{C_1}_{i,k}, \mathbf{C_2}_{j,l}) \mathbf{T}_{i,j} \mathbf{T}_{k,l}
-
-        s.t. \ \mathbf(T)\mathbf{1} &= \mathbf{p}
+    The function solves the following optimization problem using Conditional Gradient:
 
-             \mathbf(T)^T \mathbf{1} &= \mathbf{q}
+    .. math::
+        \mathbf{FGW} = \mathop{\min}_\mathbf{T} \quad (1 - \alpha) \langle \mathbf{T}, \mathbf{M} \rangle_F +
+        \alpha \sum_{i,j,k,l} L(\mathbf{C_1}_{i,k}, \mathbf{C_2}_{j,l}) \mathbf{T}_{i,j} \mathbf{T}_{k,l}
 
-             \mathbf(T) &\geq 0
+        s.t. \ \mathbf{T} \mathbf{1} &= \mathbf{p}
 
-    where :
+             \mathbf{T}^T \mathbf{1} &= \mathbf{q}
 
-    - :math:`\mathbf{M}` is the (`ns`, `nt`) metric cost matrix
-    - :math:`\mathbf{p}` and :math:`\mathbf{q}` are source and target weights (sum to 1)
-    - `L` is a loss function to account for the misfit between the similarity matrices
+             \mathbf{T} &\geq 0
+    Where :
 
-    The algorithm used for solving the problem is conditional gradient as
-    discussed in :ref:`[24] <references-fused-gromov-wasserstein2>`
+    - :math:`\mathbf{M}`: metric cost matrix between features across domains
+    - :math:`\mathbf{C_1}`: Metric cost matrix in the source space
+    - :math:`\mathbf{C_2}`: Metric cost matrix in the target space
+    - :math:`\mathbf{p}`: distribution in the source space
+    - :math:`\mathbf{q}`: distribution in the target space
+    - `L`: loss function to account for the misfit between the similarity and feature matrices
+    - :math:`\alpha`: trade-off parameter
+    
+    Note that when using backends, this loss function is differentiable wrt the
+    matrices (C1, C2, M) and weights (p, q) for quadratic loss using the gradients from [38]_.
 
     .. note:: This function is backend-compatible and will work on arrays
         from all compatible backends. But the algorithm uses the C++ CPU backend
@@ -492,9 +504,6 @@ def fused_gromov_wasserstein2(M, C1, C2, p=None, q=None, loss_fun='square_loss',
     .. note:: All computations in the conjugate gradient solver are done with
         numpy to limit memory overhead.
 
-    Note that when using backends, this loss function is differentiable wrt the
-    matrices (C1, C2, M) and weights (p, q) for quadratic loss using the gradients from [38]_.
-
     Parameters
     ----------
     M : array-like, shape (ns, nt)
@@ -668,13 +677,13 @@ def gromov_barycenters(
         max_iter=1000, tol=1e-9, warmstartT=False, verbose=False, log=False,
         init_C=None, random_state=None, **kwargs):
     r"""
-    Returns the gromov-wasserstein barycenters of `S` measured similarity matrices :math:`(\mathbf{C}_s)_{1 \leq s \leq S}`
+    Returns the Gromov-Wasserstein barycenters of `S` measured similarity matrices :math:`(\mathbf{C}_s)_{1 \leq s \leq S}`
 
     The function solves the following optimization problem with block coordinate descent:
 
     .. math::
 
-        \mathbf{C} = \mathop{\arg \min}_{\mathbf{C}\in \mathbb{R}^{N \times N}} \quad \sum_s \lambda_s \mathrm{GW}(\mathbf{C}, \mathbf{C}_s, \mathbf{p}, \mathbf{p}_s)
+        \mathbf{C}^* = \mathop{\arg \min}_{\mathbf{C}\in \mathbb{R}^{N \times N}} \quad \sum_s \lambda_s \mathrm{GW}(\mathbf{C}, \mathbf{C}_s, \mathbf{p}, \mathbf{p}_s)
 
     Where :
 
@@ -812,8 +821,22 @@ def fgw_barycenters(
         fixed_features=False, p=None, loss_fun='square_loss', armijo=False,
         symmetric=True, max_iter=100, tol=1e-9, warmstartT=False, verbose=False,
         log=False, init_C=None, init_X=None, random_state=None, **kwargs):
-    r"""Compute the fgw barycenter as presented eq (5) in :ref:`[24] <references-fgw-barycenters>`
+    r"""
+    Returns the Fused Gromov-Wasserstein barycenters of `S` measurable networks with node features :math:`(\mathbf{C}_s, \mathbf{Y}_s, \mathbf{p}_s)_{1 \leq s \leq S}`
+    (see eq (5) in :ref:`[24] <references-fgw-barycenters>`), estimated using Fused Gromov-Wasserstein transports from Conditional Gradient solvers.
 
+    The function solves the following optimization problem:
+
+    .. math::
+
+        \mathbf{C}^*, \mathbf{Y}^* = \mathop{\arg \min}_{\mathbf{C}\in \mathbb{R}^{N \times N}, \mathbf{Y}\in \mathbb{R}^{N \times d}} \quad \sum_s \lambda_s \mathrm{FGW}_{\alpha}(\mathbf{C}, \mathbf{C}_s, \mathbf{Y}, \mathbf{Y}_s, \mathbf{p}, \mathbf{p}_s)
+
+    Where :
+
+    - :math:`\mathbf{Y}_s`: feature matrix
+    - :math:`\mathbf{C}_s`: metric cost matrix
+    - :math:`\mathbf{p}_s`: distribution
+    
     Parameters
     ----------
     N : int
diff --git a/ot/gromov/_semirelaxed.py b/ot/gromov/_semirelaxed.py
index 206329dfc..b36a81c75 100644
--- a/ot/gromov/_semirelaxed.py
+++ b/ot/gromov/_semirelaxed.py
@@ -21,12 +21,12 @@
 def semirelaxed_gromov_wasserstein(C1, C2, p=None, loss_fun='square_loss', symmetric=None, log=False, G0=None,
                                    max_iter=1e4, tol_rel=1e-9, tol_abs=1e-9, **kwargs):
     r"""
-    Returns the semi-relaxed Gromov-Wasserstein divergence transport from :math:`(\mathbf{C_1}, \mathbf{p})` to :math:`\mathbf{C_2}`
+    Returns the semi-relaxed Gromov-Wasserstein divergence transport from :math:`(\mathbf{C_1}, \mathbf{p})` to :math:`\mathbf{C_2}` (see [48]).
 
-    The function solves the following optimization problem:
+    The function solves the following optimization problem using Conditional Gradient:
 
     .. math::
-        \mathbf{T}^^* \in \mathop{\arg \min}_{\mathbf{T}} \quad \sum_{i,j,k,l}
+        \mathbf{T}^* \in \mathop{\arg \min}_{\mathbf{T}} \quad \sum_{i,j,k,l}
         L(\mathbf{C_1}_{i,k}, \mathbf{C_2}_{j,l}) \mathbf{T}_{i,j} \mathbf{T}_{k,l}
 
         s.t. \ \mathbf{T} \mathbf{1} &= \mathbf{p}
@@ -152,9 +152,9 @@ def line_search(cost, G, deltaG, Mi, cost_G, **kwargs):
 def semirelaxed_gromov_wasserstein2(C1, C2, p=None, loss_fun='square_loss', symmetric=None, log=False, G0=None,
                                     max_iter=1e4, tol_rel=1e-9, tol_abs=1e-9, **kwargs):
     r"""
-    Returns the semi-relaxed gromov-wasserstein divergence from :math:`(\mathbf{C_1}, \mathbf{p})` to :math:`\mathbf{C_2}`
+    Returns the semi-relaxed Gromov-Wasserstein divergence from :math:`(\mathbf{C_1}, \mathbf{p})` to :math:`\mathbf{C_2}` (see [48]).
 
-    The function solves the following optimization problem:
+    The function solves the following optimization problem using Conditional Gradient:
 
     .. math::
         \text{srGW} = \min_{\mathbf{T}} \quad \sum_{i,j,k,l}
@@ -255,7 +255,7 @@ def semirelaxed_fused_gromov_wasserstein(
         M, C1, C2, p=None, loss_fun='square_loss', symmetric=None, alpha=0.5,
         G0=None, log=False, max_iter=1e4, tol_rel=1e-9, tol_abs=1e-9, **kwargs):
     r"""
-    Computes the semi-relaxed FGW transport between two graphs (see :ref:`[48] <references-semirelaxed-fused-gromov-wasserstein>`)
+    Computes the semi-relaxed Fused Gromov-Wasserstein transport between two graphs (see [48]).
 
     .. math::
         \mathbf{T}^* \in \mathop{\arg \min}_{\mathbf{T}} \quad (1 - \alpha) \langle \mathbf{T}, \mathbf{M} \rangle_F +
@@ -395,10 +395,10 @@ def line_search(cost, G, deltaG, Mi, cost_G, **kwargs):
 def semirelaxed_fused_gromov_wasserstein2(M, C1, C2, p=None, loss_fun='square_loss', symmetric=None, alpha=0.5, G0=None, log=False,
                                           max_iter=1e4, tol_rel=1e-9, tol_abs=1e-9, **kwargs):
     r"""
-    Computes the semi-relaxed FGW divergence between two graphs (see :ref:`[48] <references-semirelaxed-fused-gromov-wasserstein2>`)
+    Computes the semi-relaxed FGW divergence between two graphs (see [48]).
 
     .. math::
-        \mathbf{srFGW} = \min_{\mathbf{T}} \quad (1 - \alpha) \langle \mathbf{T}, \mathbf{M} \rangle_F +
+        \mathbf{srFGW}_{\alpha} = \min_{\mathbf{T}} \quad (1 - \alpha) \langle \mathbf{T}, \mathbf{M} \rangle_F +
         \alpha \sum_{i,j,k,l} L(\mathbf{C_1}_{i,k}, \mathbf{C_2}_{j,l}) T_{i,j} T_{k,l}
 
         s.t. \ \mathbf{T} \mathbf{1} &= \mathbf{p}
@@ -511,7 +511,7 @@ def semirelaxed_fused_gromov_wasserstein2(M, C1, C2, p=None, loss_fun='square_lo
 def solve_semirelaxed_gromov_linesearch(G, deltaG, cost_G, C1, C2, ones_p,
                                         M, reg, alpha_min=None, alpha_max=None, nx=None, **kwargs):
     """
-    Solve the linesearch in the FW iterations
+    Solve the linesearch in the Conditional Gradient iterations for the semi-relaxed Gromov-Wasserstein divergence.
 
     Parameters
     ----------
@@ -829,6 +829,7 @@ def entropic_semirelaxed_fused_gromov_wasserstein(
         alpha=0.5, G0=None, max_iter=1e4, tol=1e-9, log=False, verbose=False, **kwargs):
     r"""
     Computes the entropic-regularized semi-relaxed FGW transport between two graphs (see :ref:`[48] <references-semirelaxed-fused-gromov-wasserstein>`)
+    estimated using a Mirror Descent algorithm following the KL geometry.
 
     .. math::
         \mathbf{T}^* \in \mathop{\arg \min}_{\mathbf{T}} \quad (1 - \alpha) \langle \mathbf{T}, \mathbf{M} \rangle_F +
@@ -988,10 +989,11 @@ def entropic_semirelaxed_fused_gromov_wasserstein2(
         M, C1, C2, p=None, loss_fun='square_loss', symmetric=None, epsilon=0.1,
         alpha=0.5, G0=None, max_iter=1e4, tol=1e-9, log=False, verbose=False, **kwargs):
     r"""
-    Computes the entropic-regularized semi-relaxed FGW transport between two graphs (see :ref:`[48] <references-semirelaxed-fused-gromov-wasserstein>`)
+    Computes the entropic-regularized semi-relaxed FGW divergence between two graphs (see :ref:`[48] <references-semirelaxed-fused-gromov-wasserstein>`)
+    estimated using a Mirror Descent algorithm following the KL geometry.
 
     .. math::
-        \mathbf{srFGW} = \min_{\mathbf{T}} \quad (1 - \alpha) \langle \mathbf{T}, \mathbf{M} \rangle_F +
+        \mathbf{srFGW}_{\alpha} = \min_{\mathbf{T}} \quad (1 - \alpha) \langle \mathbf{T}, \mathbf{M} \rangle_F +
         \alpha \sum_{i,j,k,l} L(\mathbf{C_1}_{i,k}, \mathbf{C_2}_{j,l}) \mathbf{T}_{i,j} \mathbf{T}_{k,l}
 
         s.t. \ \mathbf{T} \mathbf{1} &= \mathbf{p}

From ff4715ae754025de7cb1002c84361cbfd20565c9 Mon Sep 17 00:00:00 2001
From: clvincen <cedvincentcuaz@gmail.com>
Date: Fri, 4 Aug 2023 14:38:22 +0200
Subject: [PATCH 2/4] fix pep8 + update RELEASE

---
 RELEASES.md      | 1 +
 ot/gromov/_gw.py | 6 +++---
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/RELEASES.md b/RELEASES.md
index 6a2cdf9f4..c8e735db9 100644
--- a/RELEASES.md
+++ b/RELEASES.md
@@ -33,6 +33,7 @@ Many other bugs and issues have been fixed and we want to thank all the contribu
 
 #### Closed issues
 
+- Fix gromov conventions (PR #497)
 - Fix change in scipy API for `cdist` (PR #487)
 - More permissive check_backend (PR #494)
 - Fix circleci-redirector action and codecov (PR #460)
diff --git a/ot/gromov/_gw.py b/ot/gromov/_gw.py
index 5b935e34f..10f77104e 100644
--- a/ot/gromov/_gw.py
+++ b/ot/gromov/_gw.py
@@ -332,7 +332,7 @@ def fused_gromov_wasserstein(M, C1, C2, p=None, q=None, loss_fun='square_loss',
     - :math:`\mathbf{q}`: distribution in the target space
     - `L`: loss function to account for the misfit between the similarity and feature matrices
     - :math:`\alpha`: trade-off parameter
-    
+
     .. note:: This function is backend-compatible and will work on arrays
         from all compatible backends. But the algorithm uses the C++ CPU backend
         which can lead to copy overhead on GPU arrays.
@@ -494,7 +494,7 @@ def fused_gromov_wasserstein2(M, C1, C2, p=None, q=None, loss_fun='square_loss',
     - :math:`\mathbf{q}`: distribution in the target space
     - `L`: loss function to account for the misfit between the similarity and feature matrices
     - :math:`\alpha`: trade-off parameter
-    
+
     Note that when using backends, this loss function is differentiable wrt the
     matrices (C1, C2, M) and weights (p, q) for quadratic loss using the gradients from [38]_.
 
@@ -836,7 +836,7 @@ def fgw_barycenters(
     - :math:`\mathbf{Y}_s`: feature matrix
     - :math:`\mathbf{C}_s`: metric cost matrix
     - :math:`\mathbf{p}_s`: distribution
-    
+
     Parameters
     ----------
     N : int

From f44f81bab08cbe0c58cdd5a760d76141e2b30408 Mon Sep 17 00:00:00 2001
From: clvincen <cedvincentcuaz@gmail.com>
Date: Fri, 4 Aug 2023 14:59:50 +0200
Subject: [PATCH 3/4] improve doc

---
 ot/gromov/_bregman.py    | 3 +--
 ot/gromov/_estimators.py | 3 +--
 ot/gromov/_gw.py         | 4 ++--
 3 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/ot/gromov/_bregman.py b/ot/gromov/_bregman.py
index 10a0d65c2..fe249639e 100644
--- a/ot/gromov/_bregman.py
+++ b/ot/gromov/_bregman.py
@@ -226,8 +226,7 @@ def entropic_gromov_wasserstein2(
         tol=1e-9, solver='PGD', warmstart=False, verbose=False, log=False, **kwargs):
     r"""
     Returns the Gromov-Wasserstein discrepancy :math:`\mathbf{GW}` between :math:`(\mathbf{C_1}, \mathbf{p})` and :math:`(\mathbf{C_2}, \mathbf{q})`
-    estimated using Sinkhorn projections.
-    The Gromov-Wasserstein distance as defined in [13] satisfies :math:`d_{GW} = \frac{1}{2} \sqrt{\mathbf{GW}}`.
+    estimated using Sinkhorn projections. To recover the Gromov-Wasserstein distance as defined in [13] compute :math:`d_{GW} = \frac{1}{2} \sqrt{\mathbf{GW}}`.
 
     If `solver="PGD"`, the function solves the following entropic-regularized
     Gromov-Wasserstein optimization problem using Projected Gradient Descent [12]:
diff --git a/ot/gromov/_estimators.py b/ot/gromov/_estimators.py
index 9407ecf64..7ea7e2a8c 100644
--- a/ot/gromov/_estimators.py
+++ b/ot/gromov/_estimators.py
@@ -21,8 +21,7 @@ def GW_distance_estimation(C1, C2, p, q, loss_fun, T,
                            nb_samples_p=None, nb_samples_q=None, std=True, random_state=None):
     r"""
     Returns an approximation of the Gromov-Wasserstein discrepancy between :math:`(\mathbf{C_1}, \mathbf{p})` and :math:`(\mathbf{C_2}, \mathbf{q})`
-    with a fixed transport plan :math:`\mathbf{T}`.
-    An approximation of the Gromov-Wasserstein distance as defined in [13] satisfies :math:`d_{GW} = \frac{1}{2} \sqrt{\mathbf{GW}}`.
+    with a fixed transport plan :math:`\mathbf{T}`. To recover an approximation of the Gromov-Wasserstein distance as defined in [13] compute :math:`d_{GW} = \frac{1}{2} \sqrt{\mathbf{GW}}`.
 
     The function gives an unbiased approximation of the following equation:
 
diff --git a/ot/gromov/_gw.py b/ot/gromov/_gw.py
index 10f77104e..20373f33b 100644
--- a/ot/gromov/_gw.py
+++ b/ot/gromov/_gw.py
@@ -182,8 +182,8 @@ def line_search(cost, G, deltaG, Mi, cost_G, **kwargs):
 def gromov_wasserstein2(C1, C2, p=None, q=None, loss_fun='square_loss', symmetric=None, log=False, armijo=False, G0=None,
                         max_iter=1e4, tol_rel=1e-9, tol_abs=1e-9, **kwargs):
     r"""
-    Returns the Gromov-Wasserstein discrepancy :math:`\mathbf{GW}` between :math:`(\mathbf{C_1}, \mathbf{p})` and :math:`(\mathbf{C_2}, \mathbf{q})`.
-    The Gromov-Wasserstein distance as defined in [13] satisfies :math:`d_{GW} = \frac{1}{2} \sqrt{\mathbf{GW}}`.
+    Returns the Gromov-Wasserstein loss :math:`\mathbf{GW}` between :math:`(\mathbf{C_1}, \mathbf{p})` and :math:`(\mathbf{C_2}, \mathbf{q})`.
+    To recover the Gromov-Wasserstein distance as defined in [13] compute :math:`d_{GW} = \frac{1}{2} \sqrt{\mathbf{GW}}`.
 
     The function solves the following optimization problem using Conditional Gradient:
 

From d9b7fe614a6a1134e758581a3f7fe3c70edd0771 Mon Sep 17 00:00:00 2001
From: clvincen <cedvincentcuaz@gmail.com>
Date: Fri, 4 Aug 2023 16:20:14 +0200
Subject: [PATCH 4/4] merge

---
 ot/gromov/_bregman.py    | 2 +-
 ot/gromov/_estimators.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/ot/gromov/_bregman.py b/ot/gromov/_bregman.py
index fe249639e..792ed6e86 100644
--- a/ot/gromov/_bregman.py
+++ b/ot/gromov/_bregman.py
@@ -225,7 +225,7 @@ def entropic_gromov_wasserstein2(
         C1, C2, p=None, q=None, loss_fun='square_loss', epsilon=0.1, symmetric=None, G0=None, max_iter=1000,
         tol=1e-9, solver='PGD', warmstart=False, verbose=False, log=False, **kwargs):
     r"""
-    Returns the Gromov-Wasserstein discrepancy :math:`\mathbf{GW}` between :math:`(\mathbf{C_1}, \mathbf{p})` and :math:`(\mathbf{C_2}, \mathbf{q})`
+    Returns the Gromov-Wasserstein loss :math:`\mathbf{GW}` between :math:`(\mathbf{C_1}, \mathbf{p})` and :math:`(\mathbf{C_2}, \mathbf{q})`
     estimated using Sinkhorn projections. To recover the Gromov-Wasserstein distance as defined in [13] compute :math:`d_{GW} = \frac{1}{2} \sqrt{\mathbf{GW}}`.
 
     If `solver="PGD"`, the function solves the following entropic-regularized
diff --git a/ot/gromov/_estimators.py b/ot/gromov/_estimators.py
index 7ea7e2a8c..7e12ef930 100644
--- a/ot/gromov/_estimators.py
+++ b/ot/gromov/_estimators.py
@@ -20,7 +20,7 @@
 def GW_distance_estimation(C1, C2, p, q, loss_fun, T,
                            nb_samples_p=None, nb_samples_q=None, std=True, random_state=None):
     r"""
-    Returns an approximation of the Gromov-Wasserstein discrepancy between :math:`(\mathbf{C_1}, \mathbf{p})` and :math:`(\mathbf{C_2}, \mathbf{q})`
+    Returns an approximation of the Gromov-Wasserstein loss between :math:`(\mathbf{C_1}, \mathbf{p})` and :math:`(\mathbf{C_2}, \mathbf{q})`
     with a fixed transport plan :math:`\mathbf{T}`. To recover an approximation of the Gromov-Wasserstein distance as defined in [13] compute :math:`d_{GW} = \frac{1}{2} \sqrt{\mathbf{GW}}`.
 
     The function gives an unbiased approximation of the following equation: