Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 5 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -44,11 +44,13 @@ diversified_result = diversify(
diversity=0.5 # Diversity parameter (higher values prioritize diversity)
)

# Get the indicices of the diversified result
# Get the indices of the diversified result
diversified_indices = diversified_result.indices
```

The returned `DiversificationResult` can be used to access the diversified `indices`, as well as the `marginal gains` of the selected strategy and other useful info. The strategies are extremely fast and scalable: this example runs in 0.0001s.
The returned `DiversificationResult` can be used to access the diversified `indices`, as well as the `selection_scores` of the selected strategy and other useful info. The strategies are extremely fast and scalable: this example runs in milliseconds.

The `diversity` parameter tunes the trade-off between relevance and diversity: 0.0 focuses purely on relevance (no diversification), while 1.0 maximizes diversity, potentially at the cost of relevance.

## Supported Strategies

Expand All @@ -64,7 +66,7 @@ The following table describes the supported strategies, how they work, their tim

## Motivation

Traditional retrieval systems rank results purely by relevance (how closely each item matches the query) While effective, this can lead to redundancy: top results often look nearly identical, which can create a poor user experience.
Traditional retrieval systems rank results purely by relevance (how closely each item matches the query). While effective, this can lead to redundancy: top results often look nearly identical, which can create a poor user experience.

Diversification techniques like MMR, MSD, COVER, and DPP help balance relevance and variety.
Each new item is chosen not only because it’s relevant, but also because it adds new information that wasn’t already covered by earlier results.
Expand Down
4 changes: 2 additions & 2 deletions src/pyversity/datatypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,15 +28,15 @@ class DiversificationResult:
Attributes
----------
indices: Diversified item indices.
marginal_gains: Marginal gains/relevance scores for the diversified items.
selection_scores: Selection scores for the diversified items.
strategy: Diversification strategy used.
diversity: Diversity parameter used in the strategy.
parameters: Additional parameters used in the strategy.

"""

indices: np.ndarray
marginal_gains: np.ndarray
selection_scores: np.ndarray
strategy: Strategy
diversity: float
parameters: dict | None = None
2 changes: 1 addition & 1 deletion src/pyversity/pyversity.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ def diversify(
:param diversity: Diversity parameter (range of [0, 1]). Higher values prioritize diversity and lower values prioritize relevance.
:param **kwargs: Additional keyword arguments passed to the specific strategy function.
:return: A DiversificationResult containing the selected item indices,
their marginal gains, the strategy used, and the parameters.
their selection scores, the strategy used, and the parameters.
:raises ValueError: If the provided strategy is not recognized.
"""
if strategy == Strategy.MMR:
Expand Down
14 changes: 7 additions & 7 deletions src/pyversity/strategies/cover.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,22 +14,22 @@ def cover(
normalize: bool = True,
) -> DiversificationResult:
"""
Select a subset of items that balances relevance and coverage.
Select a subset of items that balances relevance and coverage/diversity.

This strategy chooses `k` items by combining pure relevance with
diversity-driven coverage using a concave submodular formulation.

:param embeddings: 2D array of shape (n_samples, n_features).
:param scores: 1D array of relevance scores for each item.
:param k: Number of items to select.
:param diversity: Trade-off between relevance and coverage in [0, 1] (inverse of theta parameter).
:param diversity: Trade-off between relevance and coverage/diversity in [0, 1] (inverse of theta parameter).
1.0 = pure diversity, 0.0 = pure relevance.
:param gamma: Concavity parameter in (0, 1]; lower values emphasize diversity.
:param metric: Similarity metric to use. Default is Metric.COSINE.
:param normalize: Whether to normalize embeddings before computing similarity.
:return: A DiversificationResult containing the selected item indices,
their marginal gains, the strategy used, and the parameters.
:raises ValueError: If theta is not in [0, 1].
their selection scores, the strategy used, and the parameters.
:raises ValueError: If diversity is not in [0, 1].
:raises ValueError: If gamma is not in (0, 1].
"""
# Validate parameters
Expand All @@ -53,7 +53,7 @@ def cover(
# Nothing to select: return empty arrays
return DiversificationResult(
indices=np.empty(0, np.int32),
marginal_gains=np.empty(0, np.float32),
selection_scores=np.empty(0, np.float32),
strategy=Strategy.COVER,
diversity=diversity,
parameters=params,
Expand All @@ -69,7 +69,7 @@ def cover(
gains = relevance_scores[topk].astype(np.float32, copy=False)
return DiversificationResult(
indices=topk,
marginal_gains=gains,
selection_scores=gains,
strategy=Strategy.COVER,
diversity=diversity,
parameters=params,
Expand Down Expand Up @@ -106,7 +106,7 @@ def cover(

return DiversificationResult(
indices=selected_indices,
marginal_gains=marginal_gains,
selection_scores=marginal_gains,
strategy=Strategy.COVER,
diversity=diversity,
parameters=params,
Expand Down
12 changes: 6 additions & 6 deletions src/pyversity/strategies/dpp.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ def dpp(

This strategy selects a diverse and relevant subset of `k` items by
maximizing the determinant of a kernel matrix that balances item relevance
and pairwise similarity.
and pairwise similarity.

:param embeddings: 2D array of shape (n_samples, n_features).
:param scores: 1D array of relevance scores for each item.
Expand All @@ -33,7 +33,7 @@ def dpp(
Higher values increase the emphasis on diversity.
:param scale: Optional scaling factor for the beta parameter to adjust relevance influence.
:return: A DiversificationResult containing the selected item indices,
their marginal gains, the strategy used, and the parameters.
their selection scores, the strategy used, and the parameters.
:raises ValueError: If diversity is not in [0, 1].
"""
if not (0.0 <= float(diversity) <= 1.0):
Expand All @@ -49,7 +49,7 @@ def dpp(
# Nothing to select: return empty arrays
return DiversificationResult(
indices=np.empty(0, np.int32),
marginal_gains=np.empty(0, np.float32),
selection_scores=np.empty(0, np.float32),
strategy=Strategy.DPP,
diversity=diversity,
parameters={"scale": scale},
Expand Down Expand Up @@ -80,8 +80,8 @@ def dpp(
marginal_gains[step] = best_score
selected_mask[best_index] = True

if step == top_k - 1 or best_score <= 0.0:
# No more items to select or no positive gain
if step == top_k - 1:
# No more items to select
step += 1
break

Expand All @@ -106,7 +106,7 @@ def dpp(

return DiversificationResult(
indices=selected_indices[:step],
marginal_gains=marginal_gains[:step],
selection_scores=marginal_gains[:step],
strategy=Strategy.DPP,
diversity=diversity,
parameters={"scale": scale},
Expand Down
2 changes: 1 addition & 1 deletion src/pyversity/strategies/mmr.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ def mmr(
:param metric: Similarity metric to use. Default is Metric.COSINE.
:param normalize: Whether to normalize embeddings before computing similarity.
:return: A DiversificationResult containing the selected item indices,
their marginal gains, the strategy used, and the parameters.
their selection scores, the strategy used, and the parameters.
"""
return greedy_select(
"mmr",
Expand Down
2 changes: 1 addition & 1 deletion src/pyversity/strategies/msd.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ def msd(
:param metric: Similarity metric to use. Default is Metric.COSINE.
:param normalize: Whether to normalize embeddings before computing similarity.
:return: A DiversificationResult containing the selected item indices,
their marginal gains, the strategy used, and the parameters.
their selection scores, the strategy used, and the parameters.
"""
return greedy_select(
"msd",
Expand Down
6 changes: 3 additions & 3 deletions src/pyversity/strategies/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ def greedy_select(
:param diversity: Trade-off parameter in [0, 1]. Inverse of lambda parameter.
1.0 = pure diversity, 0.0 = pure relevance.
:return: A DiversificationResult containing the selected item indices,
their marginal gains, the strategy used, and the parameters.
their selection scores, the strategy used, and the parameters.
:raises ValueError: If diversity is not in [0, 1].
:raises ValueError: If input shapes are inconsistent.
"""
Expand All @@ -55,7 +55,7 @@ def greedy_select(
# Nothing to select: return empty arrays
return DiversificationResult(
indices=np.empty(0, np.int32),
marginal_gains=np.empty(0, np.float32),
selection_scores=np.empty(0, np.float32),
strategy=Strategy.MMR if strategy == "mmr" else Strategy.MSD,
diversity=diversity,
parameters=params,
Expand Down Expand Up @@ -110,7 +110,7 @@ def greedy_select(

return DiversificationResult(
indices=selected_indices,
marginal_gains=marginal_gains,
selection_scores=marginal_gains,
strategy=Strategy.MMR if strategy == "mmr" else Strategy.MSD,
diversity=diversity,
parameters=params,
Expand Down
4 changes: 2 additions & 2 deletions src/pyversity/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,10 @@ def prepare_inputs(embeddings: np.ndarray, scores: np.ndarray, k: int) -> tuple[
"""
Prepare relevance scores and embeddings.

:param embeddings: Array of shape embeddings.
:param embeddings: Array of embeddings.
:param scores: Array of relevance scores.
:param k: Number of top elements to consider.
:return: Tuple of relevances, embeddings, k_clamped, early_exit.
:return: Tuple of embeddings, relevances, k_clamped, early_exit.
:raises ValueError: If input shapes are inconsistent.
"""
relevance_scores = np.asarray(scores, dtype=np.float32).reshape(-1)
Expand Down
20 changes: 10 additions & 10 deletions tests/test_strategies.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ def test_mmr() -> None:
res = mmr(emb, scores, k=3, diversity=0.0, metric=Metric.COSINE, normalize=True)
expected = np.array([1, 3, 2], dtype=np.int32)
assert np.array_equal(res.indices, expected)
assert np.allclose(res.marginal_gains, scores[expected])
assert np.allclose(res.selection_scores, scores[expected])

# Strong diversity (diversity=1): avoid near-duplicate
emb = np.array([[1.0, 0.0], [0.999, 0.001], [0.0, 1.0]], dtype=np.float32)
Expand Down Expand Up @@ -63,7 +63,7 @@ def test_cover() -> None:
res = cover(emb, scores, k=2, diversity=0.0)
expected = np.array([1, 2], dtype=np.int32)
assert np.array_equal(res.indices, expected)
assert np.allclose(res.marginal_gains, scores[expected])
assert np.allclose(res.selection_scores, scores[expected])

# Balanced coverage (diversity=0.5, gamma=0.5): picks diverse set
res = cover(emb, scores, k=2, diversity=0.5, gamma=0.5)
Expand All @@ -88,24 +88,24 @@ def test_dpp() -> None:
# Strong diversity (diversity=1)
res = dpp(emb, scores, k=2, diversity=1.0)
assert 1 <= res.indices.size <= 2
assert np.all(res.marginal_gains >= -1e-7)
assert np.all(res.marginal_gains[:-1] + 1e-7 >= res.marginal_gains[1:])
assert np.all(res.selection_scores >= -1e-7)
assert np.all(res.selection_scores[:-1] + 1e-7 >= res.selection_scores[1:])

# Balanced (diversity=0.5)
res = dpp(emb, scores, k=2, diversity=0.5)
assert 1 <= res.indices.size <= 2
assert np.all(res.marginal_gains >= -1e-7)
assert np.all(res.marginal_gains[:-1] + 1e-7 >= res.marginal_gains[1:])
assert np.all(res.selection_scores >= -1e-7)
assert np.all(res.selection_scores[:-1] + 1e-7 >= res.selection_scores[1:])

# Low diversity (diversity=0.0): more relevance-driven
res = dpp(emb, scores, k=2, diversity=0.0)
assert 1 <= res.indices.size <= 2
assert np.all(res.marginal_gains >= -1e-7)
assert np.all(res.marginal_gains[:-1] + 1e-7 >= res.marginal_gains[1:])
assert np.all(res.selection_scores >= -1e-7)
assert np.all(res.selection_scores[:-1] + 1e-7 >= res.selection_scores[1:])

# Early exit on empty input
res = dpp(np.empty((0, 3), dtype=np.float32), np.array([], dtype=np.float32), k=3)
assert res.indices.size == 0 and res.marginal_gains.size == 0
assert res.indices.size == 0 and res.selection_scores.size == 0


@pytest.mark.parametrize(
Expand All @@ -129,4 +129,4 @@ def test_diversify(strategy: Strategy, fn: Callable[..., DiversificationResult],
res_disp = diversify(embeddings=emb, scores=scores, k=2, strategy=strategy, **kwargs)

assert np.array_equal(res_direct.indices, res_disp.indices)
assert np.allclose(res_direct.marginal_gains, res_disp.marginal_gains)
assert np.allclose(res_direct.selection_scores, res_disp.selection_scores)