From 668b51195be2943907010574a11abc703bac6410 Mon Sep 17 00:00:00 2001 From: Pringled Date: Sat, 11 Oct 2025 08:44:13 +0200 Subject: [PATCH 1/2] Renamed variable, small code updates --- README.md | 4 ++-- src/pyversity/datatypes.py | 4 ++-- src/pyversity/pyversity.py | 2 +- src/pyversity/strategies/cover.py | 14 +++++++------- src/pyversity/strategies/dpp.py | 12 ++++++------ src/pyversity/strategies/mmr.py | 2 +- src/pyversity/strategies/msd.py | 2 +- src/pyversity/strategies/utils.py | 6 +++--- src/pyversity/utils.py | 4 ++-- tests/test_strategies.py | 20 ++++++++++---------- 10 files changed, 35 insertions(+), 35 deletions(-) diff --git a/README.md b/README.md index b53fc95..fb01443 100644 --- a/README.md +++ b/README.md @@ -44,11 +44,11 @@ diversified_result = diversify( diversity=0.5 # Diversity parameter (higher values prioritize diversity) ) -# Get the indicices of the diversified result +# Get the indices of the diversified result diversified_indices = diversified_result.indices ``` -The returned `DiversificationResult` can be used to access the diversified `indices`, as well as the `marginal gains` of the selected strategy and other useful info. The strategies are extremely fast and scalable: this example runs in 0.0001s. +The returned `DiversificationResult` can be used to access the diversified `indices`, as well as the `selection_scores` of the selected strategy and other useful info. The strategies are extremely fast and scalable: this example runs in 0.0001s. ## Supported Strategies diff --git a/src/pyversity/datatypes.py b/src/pyversity/datatypes.py index ef66fae..c74c1ae 100644 --- a/src/pyversity/datatypes.py +++ b/src/pyversity/datatypes.py @@ -28,7 +28,7 @@ class DiversificationResult: Attributes ---------- indices: Diversified item indices. - marginal_gains: Marginal gains/relevance scores for the diversified items. + selection_scores: Selection scores for the diversified items. strategy: Diversification strategy used. 
diversity: Diversity parameter used in the strategy. parameters: Additional parameters used in the strategy. @@ -36,7 +36,7 @@ class DiversificationResult: """ indices: np.ndarray - marginal_gains: np.ndarray + selection_scores: np.ndarray strategy: Strategy diversity: float parameters: dict | None = None diff --git a/src/pyversity/pyversity.py b/src/pyversity/pyversity.py index e7df9a6..a7a5dce 100644 --- a/src/pyversity/pyversity.py +++ b/src/pyversity/pyversity.py @@ -25,7 +25,7 @@ def diversify( :param diversity: Diversity parameter (range of [0, 1]). Higher values prioritize diversity and lower values prioritize relevance. :param **kwargs: Additional keyword arguments passed to the specific strategy function. :return: A DiversificationResult containing the selected item indices, - their marginal gains, the strategy used, and the parameters. + their selection scores, the strategy used, and the parameters. :raises ValueError: If the provided strategy is not recognized. """ if strategy == Strategy.MMR: diff --git a/src/pyversity/strategies/cover.py b/src/pyversity/strategies/cover.py index f4d1176..5e3f9e7 100644 --- a/src/pyversity/strategies/cover.py +++ b/src/pyversity/strategies/cover.py @@ -14,7 +14,7 @@ def cover( normalize: bool = True, ) -> DiversificationResult: """ - Select a subset of items that balances relevance and coverage. + Select a subset of items that balances relevance and coverage/diversity. This strategy chooses `k` items by combining pure relevance with diversity-driven coverage using a concave submodular formulation. @@ -22,14 +22,14 @@ def cover( :param embeddings: 2D array of shape (n_samples, n_features). :param scores: 1D array of relevance scores for each item. :param k: Number of items to select. - :param diversity: Trade-off between relevance and coverage in [0, 1] (inverse of theta parameter). + :param diversity: Trade-off between relevance and coverage/diversity in [0, 1] (inverse of theta parameter). 
1.0 = pure diversity, 0.0 = pure relevance. :param gamma: Concavity parameter in (0, 1]; lower values emphasize diversity. :param metric: Similarity metric to use. Default is Metric.COSINE. :param normalize: Whether to normalize embeddings before computing similarity. :return: A DiversificationResult containing the selected item indices, - their marginal gains, the strategy used, and the parameters. - :raises ValueError: If theta is not in [0, 1]. + their selection scores, the strategy used, and the parameters. + :raises ValueError: If diversity is not in [0, 1]. :raises ValueError: If gamma is not in (0, 1]. """ # Validate parameters @@ -53,7 +53,7 @@ def cover( # Nothing to select: return empty arrays return DiversificationResult( indices=np.empty(0, np.int32), - marginal_gains=np.empty(0, np.float32), + selection_scores=np.empty(0, np.float32), strategy=Strategy.COVER, diversity=diversity, parameters=params, @@ -69,7 +69,7 @@ def cover( gains = relevance_scores[topk].astype(np.float32, copy=False) return DiversificationResult( indices=topk, - marginal_gains=gains, + selection_scores=gains, strategy=Strategy.COVER, diversity=diversity, parameters=params, @@ -106,7 +106,7 @@ def cover( return DiversificationResult( indices=selected_indices, - marginal_gains=marginal_gains, + selection_scores=marginal_gains, strategy=Strategy.COVER, diversity=diversity, parameters=params, diff --git a/src/pyversity/strategies/dpp.py b/src/pyversity/strategies/dpp.py index 263f6d3..0905af3 100644 --- a/src/pyversity/strategies/dpp.py +++ b/src/pyversity/strategies/dpp.py @@ -24,7 +24,7 @@ def dpp( This strategy selects a diverse and relevant subset of `k` items by maximizing the determinant of a kernel matrix that balances item relevance - and pairwise similarity. + and pairwise similarity. :param embeddings: 2D array of shape (n_samples, n_features). :param scores: 1D array of relevance scores for each item. 
@@ -33,7 +33,7 @@ def dpp( Higher values increase the emphasis on diversity. :param scale: Optional scaling factor for the beta parameter to adjust relevance influence. :return: A DiversificationResult containing the selected item indices, - their marginal gains, the strategy used, and the parameters. + their selection scores, the strategy used, and the parameters. :raises ValueError: If diversity is not in [0, 1]. """ if not (0.0 <= float(diversity) <= 1.0): @@ -49,7 +49,7 @@ def dpp( # Nothing to select: return empty arrays return DiversificationResult( indices=np.empty(0, np.int32), - marginal_gains=np.empty(0, np.float32), + selection_scores=np.empty(0, np.float32), strategy=Strategy.DPP, diversity=diversity, parameters={"scale": scale}, @@ -80,8 +80,8 @@ def dpp( marginal_gains[step] = best_score selected_mask[best_index] = True - if step == top_k - 1 or best_score <= 0.0: - # No more items to select or no positive gain + if step == top_k - 1: + # No more items to select step += 1 break @@ -106,7 +106,7 @@ def dpp( return DiversificationResult( indices=selected_indices[:step], - marginal_gains=marginal_gains[:step], + selection_scores=marginal_gains[:step], strategy=Strategy.DPP, diversity=diversity, parameters={"scale": scale}, diff --git a/src/pyversity/strategies/mmr.py b/src/pyversity/strategies/mmr.py index b3efe97..e965c8c 100644 --- a/src/pyversity/strategies/mmr.py +++ b/src/pyversity/strategies/mmr.py @@ -27,7 +27,7 @@ def mmr( :param metric: Similarity metric to use. Default is Metric.COSINE. :param normalize: Whether to normalize embeddings before computing similarity. :return: A DiversificationResult containing the selected item indices, - their marginal gains, the strategy used, and the parameters. + their selection scores, the strategy used, and the parameters. 
""" return greedy_select( "mmr", diff --git a/src/pyversity/strategies/msd.py b/src/pyversity/strategies/msd.py index e52b109..094a734 100644 --- a/src/pyversity/strategies/msd.py +++ b/src/pyversity/strategies/msd.py @@ -27,7 +27,7 @@ def msd( :param metric: Similarity metric to use. Default is Metric.COSINE. :param normalize: Whether to normalize embeddings before computing similarity. :return: A DiversificationResult containing the selected item indices, - their marginal gains, the strategy used, and the parameters. + their selection scores, the strategy used, and the parameters. """ return greedy_select( "msd", diff --git a/src/pyversity/strategies/utils.py b/src/pyversity/strategies/utils.py index 6dc2810..e4429a4 100644 --- a/src/pyversity/strategies/utils.py +++ b/src/pyversity/strategies/utils.py @@ -33,7 +33,7 @@ def greedy_select( :param diversity: Trade-off parameter in [0, 1]. Inverse of lambda parameter. 1.0 = pure diversity, 0.0 = pure relevance. :return: A DiversificationResult containing the selected item indices, - their marginal gains, the strategy used, and the parameters. + their selection scores, the strategy used, and the parameters. :raises ValueError: If diversity is not in [0, 1]. :raises ValueError: If input shapes are inconsistent. 
""" @@ -55,7 +55,7 @@ def greedy_select( # Nothing to select: return empty arrays return DiversificationResult( indices=np.empty(0, np.int32), - marginal_gains=np.empty(0, np.float32), + selection_scores=np.empty(0, np.float32), strategy=Strategy.MMR if strategy == "mmr" else Strategy.MSD, diversity=diversity, parameters=params, @@ -110,7 +110,7 @@ def greedy_select( return DiversificationResult( indices=selected_indices, - marginal_gains=marginal_gains, + selection_scores=marginal_gains, strategy=Strategy.MMR if strategy == "mmr" else Strategy.MSD, diversity=diversity, parameters=params, diff --git a/src/pyversity/utils.py b/src/pyversity/utils.py index 847506d..9f758fb 100644 --- a/src/pyversity/utils.py +++ b/src/pyversity/utils.py @@ -23,10 +23,10 @@ def prepare_inputs(embeddings: np.ndarray, scores: np.ndarray, k: int) -> tuple[ """ Prepare relevance scores and embeddings. - :param embeddings: Array of shape embeddings. + :param embeddings: Array of embeddings. :param scores: Array of relevance scores. :param k: Number of top elements to consider. - :return: Tuple of relevances, embeddings, k_clamped, early_exit. + :return: Tuple of embeddings, relevances, k_clamped, early_exit. :raises ValueError: If input shapes are inconsistent. 
""" relevance_scores = np.asarray(scores, dtype=np.float32).reshape(-1) diff --git a/tests/test_strategies.py b/tests/test_strategies.py index fe9265f..8e14832 100644 --- a/tests/test_strategies.py +++ b/tests/test_strategies.py @@ -14,7 +14,7 @@ def test_mmr() -> None: res = mmr(emb, scores, k=3, diversity=0.0, metric=Metric.COSINE, normalize=True) expected = np.array([1, 3, 2], dtype=np.int32) assert np.array_equal(res.indices, expected) - assert np.allclose(res.marginal_gains, scores[expected]) + assert np.allclose(res.selection_scores, scores[expected]) # Strong diversity (diversity=1): avoid near-duplicate emb = np.array([[1.0, 0.0], [0.999, 0.001], [0.0, 1.0]], dtype=np.float32) @@ -63,7 +63,7 @@ def test_cover() -> None: res = cover(emb, scores, k=2, diversity=0.0) expected = np.array([1, 2], dtype=np.int32) assert np.array_equal(res.indices, expected) - assert np.allclose(res.marginal_gains, scores[expected]) + assert np.allclose(res.selection_scores, scores[expected]) # Balanced coverage (diversity=0.5, gamma=0.5): picks diverse set res = cover(emb, scores, k=2, diversity=0.5, gamma=0.5) @@ -88,24 +88,24 @@ def test_dpp() -> None: # Strong diversity (diversity=1) res = dpp(emb, scores, k=2, diversity=1.0) assert 1 <= res.indices.size <= 2 - assert np.all(res.marginal_gains >= -1e-7) - assert np.all(res.marginal_gains[:-1] + 1e-7 >= res.marginal_gains[1:]) + assert np.all(res.selection_scores >= -1e-7) + assert np.all(res.selection_scores[:-1] + 1e-7 >= res.selection_scores[1:]) # Balanced (diversity=0.5) res = dpp(emb, scores, k=2, diversity=0.5) assert 1 <= res.indices.size <= 2 - assert np.all(res.marginal_gains >= -1e-7) - assert np.all(res.marginal_gains[:-1] + 1e-7 >= res.marginal_gains[1:]) + assert np.all(res.selection_scores >= -1e-7) + assert np.all(res.selection_scores[:-1] + 1e-7 >= res.selection_scores[1:]) # Low diversity (diversity=0.0): more relevance-driven res = dpp(emb, scores, k=2, diversity=0.0) assert 1 <= res.indices.size <= 2 - 
assert np.all(res.marginal_gains >= -1e-7) - assert np.all(res.marginal_gains[:-1] + 1e-7 >= res.marginal_gains[1:]) + assert np.all(res.selection_scores >= -1e-7) + assert np.all(res.selection_scores[:-1] + 1e-7 >= res.selection_scores[1:]) # Early exit on empty input res = dpp(np.empty((0, 3), dtype=np.float32), np.array([], dtype=np.float32), k=3) - assert res.indices.size == 0 and res.marginal_gains.size == 0 + assert res.indices.size == 0 and res.selection_scores.size == 0 @pytest.mark.parametrize( @@ -129,4 +129,4 @@ def test_diversify(strategy: Strategy, fn: Callable[..., DiversificationResult], res_disp = diversify(embeddings=emb, scores=scores, k=2, strategy=strategy, **kwargs) assert np.array_equal(res_direct.indices, res_disp.indices) - assert np.allclose(res_direct.marginal_gains, res_disp.marginal_gains) + assert np.allclose(res_direct.selection_scores, res_disp.selection_scores) From 8eae0cb76c7b16d031884bf1359845c33128ea71 Mon Sep 17 00:00:00 2001 From: Pringled Date: Sat, 11 Oct 2025 08:48:13 +0200 Subject: [PATCH 2/2] Renamed variable, small code updates --- README.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index fb01443..2fa8f7f 100644 --- a/README.md +++ b/README.md @@ -48,7 +48,9 @@ diversified_result = diversify( diversified_indices = diversified_result.indices ``` -The returned `DiversificationResult` can be used to access the diversified `indices`, as well as the `selection_scores` of the selected strategy and other useful info. The strategies are extremely fast and scalable: this example runs in 0.0001s. +The returned `DiversificationResult` can be used to access the diversified `indices`, as well as the `selection_scores` of the selected strategy and other useful info. The strategies are extremely fast and scalable: this example runs in milliseconds. 
+ +The `diversity` parameter tunes the trade-off between relevance and diversity: 0.0 focuses purely on relevance (no diversification), while 1.0 maximizes diversity, potentially at the cost of relevance. ## Supported Strategies @@ -64,7 +66,7 @@ The following table describes the supported strategies, how they work, their tim ## Motivation -Traditional retrieval systems rank results purely by relevance (how closely each item matches the query) While effective, this can lead to redundancy: top results often look nearly identical, which can create a poor user experience. +Traditional retrieval systems rank results purely by relevance (how closely each item matches the query). While effective, this can lead to redundancy: top results often look nearly identical, which can create a poor user experience. Diversification techniques like MMR, MSD, COVER, and DPP help balance relevance and variety. Each new item is chosen not only because it’s relevant, but also because it adds new information that wasn’t already covered by earlier results.