Merge branch 'release/0.9' into main

Princeton-CDH · Feb 24, 2023 · 88f6cc1 · 88f6cc1
2 parents 716ad22 + 68a099c
commit 88f6cc1
Show file tree

Hide file tree

Showing 15 changed files with 328 additions and 53 deletions.
diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml
@@ -12,9 +12,9 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python: [3.6, 3.8]
-        solr: [8.6, 6.6]
-        django: [0, 2.2, 3.0, 3.1]
+        python: [3.8, 3.9]
+        solr: [8.6]
+        django: [0, 3.0, 3.1, 3.2]
     # We use service containers to avoid needing to set up a local copy of
     # mysql or postgres on the test runner instance. This syntax is similar to
     # the spec of a docker-compose file. For more, see:

diff --git a/CHANGELOG.rst b/CHANGELOG.rst
@@ -6,6 +6,18 @@ CHANGELOG
 0.9
 -----
 
+* ``SolrQuerySet`` now supports Solr grouping via new `group`
+  method and `GroupedResponse`
+* New class method `prep_index_chunk` on ``Indexable`` class, to support
+  prefetching related objects when iterating over Django querysets for indexing
+* Include django view mixins in sphinx documentation  
+* Dropped support for python 3.6; added python 3.9
+* Dropped support for Django 2.2; added Django 3.2
+* No longer tested against Solr 6.6
+
+0.8.2
+-----
+
 * When subclassing ``SolrQuerySet``, result documents can now be customized by extending ``get_result_document``
 
 0.8.1

diff --git a/README.rst b/README.rst
@@ -57,7 +57,7 @@ configuration and indexing content.
   .. image:: https://img.shields.io/badge/%20imports-isort-%231674b1?style=flat&labelColor=ef8336
     :target: https://pycqa.github.io/isort/
 
-Currently tested against Python 3.6 and 3.8, Solr 6.6.5 and 8.6.2, and Django 2.2-3.1 and without Django.
+Currently tested against Python 3.8 and 3.9, Solr 8.6.2, and Django 3.0-3.2 and without Django.
 
 
 Installation

diff --git a/parasolr/__init__.py b/parasolr/__init__.py
@@ -1,6 +1,6 @@
 default_app_config = "parasolr.apps.ParasolConfig"
 
-__version_info__ = (0, 9, 0, "dev")
+__version_info__ = (0, 9, 0, None)
 
 # Dot-connect all but the last. Last is dash-connected if not None.
 __version__ = ".".join([str(i) for i in __version_info__[:-1]])

diff --git a/parasolr/indexing.py b/parasolr/indexing.py
@@ -120,6 +120,15 @@ def total_to_index(cls):
         except AttributeError:
             raise NotImplementedError
 
+    @classmethod
+    def prep_index_chunk(cls, chunk):
+        """Optional method for any additional processing on chunks
+        of items being indexed. Intended to allow adding prefetching on
+        a chunk when iterating on Django QuerySets; since indexing uses Iterator,
+        prefetching configured in `items_to_index` is ignored."""
+        # default behavior is to do nothing; return chunk unchanged
+        return chunk
+
     def index_id(self):
         """Solr identifier. By default, combines :meth:`index item_type`
         and :attr:`id` with :attr:`ID_SEPARATOR`."""
@@ -168,6 +177,7 @@ def index_items(cls, items, progbar=None):
         count = 0
         while chunk:
             # call index data method if present; otherwise assume item is dict
+            chunk = cls.prep_index_chunk(chunk)
             cls.solr.update.index(
                 [i.index_data() if hasattr(i, "index_data") else i for i in chunk]
             )

diff --git a/parasolr/management/commands/index.py b/parasolr/management/commands/index.py
@@ -150,14 +150,20 @@ def handle(self, *args, **kwargs):
         # index items requested
         if to_index:
             # list of objects already gathered
+            # items are not guaranteed to be the same subclass of Indexable,
+            # so we don't specify and use the base Indexable class
             count += self.index(to_index, progbar=progbar)
 
         else:
             # iterate over indexables by type and index if requested
             for name, model in self.indexables.items():
                 if self.options["index"] in [name, "all"]:
                     # index in chunks and update progress bar
-                    count += self.index(model.items_to_index(), progbar=progbar)
+                    # pass in indexable class to ensure we use prefetching
+                    # and chunk size specific to that class
+                    count += self.index(
+                        model.items_to_index(), progbar=progbar, indexable=model
+                    )
 
         if progbar:
             progbar.finish()
@@ -170,11 +176,14 @@ def handle(self, *args, **kwargs):
             # using format for comma-separated numbers
             self.stdout.write("Indexed {:,} item{}".format(count, pluralize(count)))
 
-    def index(self, index_data, progbar=None):
+    def index(self, index_data, progbar=None, indexable=None):
         """Index an iterable into the configured solr"""
+        # if indexable subclass is not specified use the base class
+        if indexable is None:
+            indexable = Indexable
         try:
             # index in chunks and update progress bar if there is one
-            return Indexable.index_items(index_data, progbar=progbar)
+            return indexable.index_items(index_data, progbar=progbar)
         except requests.exceptions.ConnectionError as err:
             # bail out if we error connecting to Solr
             raise CommandError(err)

diff --git a/parasolr/pytest_plugin.py b/parasolr/pytest_plugin.py
@@ -145,6 +145,7 @@ def get_mock_solr_queryset(spec=SolrQuerySet, extra_methods=[]):
         "query",
         "only",
         "also",
+        "group",
         "highlight",
         "raw_query_parameters",
         "all",

diff --git a/parasolr/query/aliased_queryset.py b/parasolr/query/aliased_queryset.py
@@ -102,6 +102,14 @@ def highlight(self, field: str, **kwargs) -> "AliasedSolrQuerySet":
         field = self.field_aliases.get(field, field)
         return super().highlight(field, **kwargs)
 
+    def group(self, field: str, **kwargs) -> "AliasedSolrQuerySet":
+        """Extend :meth:`parasolr.query.queryset.SolrQuerySet.group`
+        to support using an aliased field name for the group field. (Note
+        that the sort option does not currently support aliased field names)."""
+        field = self.field_aliases.get(field, field)
+        # TODO: should we also reverse alias for sort option if specified?
+        return super().group(field, **kwargs)
+
     def get_facets(self) -> Dict[str, int]:
         """Extend :meth:`parasolr.query.queryset.SolrQuerySet.get_facets`
         to use aliased field names for facet and range facet keys."""

diff --git a/parasolr/query/queryset.py b/parasolr/query/queryset.py
@@ -37,12 +37,14 @@ class SolrQuerySet:
     filter_qs = []
     field_list = []
     highlight_fields = []
+    group_field = None
     facet_field_list = []
     stats_field_list = []
     range_facet_fields = []
     facet_opts = {}
     stats_opts = {}
     highlight_opts = {}
+    group_opts = {}
     raw_params = {}
 
     #: by default, combine search queries with AND
@@ -59,7 +61,7 @@ def __init__(self, solr: SolrClient):
         # convert search operator into form needed for combining queries
         self._search_op = " %s " % self.default_search_operator
 
-    def get_results(self, **kwargs) -> List[dict]:
+    def get_response(self, **kwargs) -> List[dict]:
         """
         Query Solr and get the results for the current query and filter
         options. Populates result cache and returns the documents portion
@@ -72,16 +74,43 @@ def get_results(self, **kwargs) -> List[dict]:
         # if query options have changed?
         # For now, always query.
 
+        # if cached and no override query args are specified,
+        # return existing cached result
+        if self._result_cache and not kwargs:
+            return self._result_cache
+
         query_opts = self.query_opts()
         query_opts.update(**kwargs)
-        # TODO: what do we do about the fact that Solr defaults
-        # to 10 rows?
+
+        # NOTE: still need to work around Solr default of 10 rows
+        # see https://github.com/Princeton-CDH/parasolr/issues/43
+
+        # note that we're caching the result with override options here,
+        # which may not always be the right thing to do ...
+        self._result_cache = self.solr.query(**query_opts)
 
         # NOTE: django templates choke on AttrDict because it is
         # callable; using dictionary response instead
-        self._result_cache = self.solr.query(**query_opts)
+
+        return self._result_cache
+
+    def get_results(self, **kwargs) -> List[dict]:
+        """
+        Query Solr and get the results for the current query and filter
+        options. Populates result cache and returns the documents portion
+        of the response.
+        (Note that this method is not currently compatible with grouping.)
+
+        Returns:
+            Solr response documents as a list of dictionaries.
+        """
+        # get query response
+        response = self.get_response(**kwargs)
         # if there is a query error, result will not be set
-        if self._result_cache:
+        if response:
+            # NOTE: should probably handle result doc transformation on grouped responses.
+            # Intentionally applying to .docs instead of .items to trigger
+            # an error if anyone attempts to use this on a grouped response
             return [self.get_result_document(doc) for doc in self._result_cache.docs]
         return []
 
@@ -99,6 +128,14 @@ def _set_highlighting_opts(self, query_opts: Dict) -> None:
             # (prefixes added in highlight methods)
             query_opts.update(self.highlight_opts)
 
+    def _set_group_opts(self, query_opts: Dict) -> None:
+        """Configure grouping attributes on query_opts. Modifies dictionary
+        directly."""
+        if self.group_field:
+            query_opts.update({"group": True, "group.field": self.group_field})
+            # any other group options can be added as-is
+            query_opts.update(self.group_opts)
+
     def _set_faceting_opts(self, query_opts: Dict) -> None:
         """Configure faceting attributes directly on query_opts. Modifies
         dictionary directly."""
@@ -146,6 +183,9 @@ def query_opts(self) -> Dict[str, str]:
         # highlighting
         self._set_highlighting_opts(query_opts)
 
+        # grouping
+        self._set_group_opts(query_opts)
+
         # faceting
         self._set_faceting_opts(query_opts)
 
@@ -528,6 +568,24 @@ def highlight(self, field: str, **kwargs) -> "SolrQuerySet":
 
         return qs_copy
 
+    def group(self, field: str, **kwargs) -> "SolrQuerySet":
+        """Configure grouping. Takes arbitrary Solr group
+        parameters and adds the `group.` prefix to them.  Example use,
+        grouping on a `group_id` field, limiting to three results per group,
+        and sorting group members by an `order` field::
+
+            queryset.group('group_id', limit=3, sort='order asc')
+        """
+        qs_copy = self._clone()
+        # store group field and grouping options
+        # for now, assuming single group field
+        qs_copy.group_field = field
+        qs_copy.group_opts.update(
+            {"group.%s" % opt: value for opt, value in kwargs.items()}
+        )
+
+        return qs_copy
+
     def raw_query_parameters(self, **kwargs) -> "SolrQuerySet":
         """Add arbitrary raw parameters to be included in the query
         request, e.g. for variables referenced in join or field queries.
@@ -538,9 +596,7 @@ def raw_query_parameters(self, **kwargs) -> "SolrQuerySet":
 
     def get_highlighting(self) -> Dict[str, Dict[str, List]]:
         """Return the highlighting portion of the Solr response."""
-        if not self._result_cache:
-            self.get_results()
-        return self._result_cache.highlighting
+        return self.get_response().highlighting
 
     def all(self) -> "SolrQuerySet":
         """Return a new queryset that is a copy of the current one."""
@@ -565,6 +621,7 @@ def _clone(self) -> "SolrQuerySet":
         qs_copy.start = self.start
         qs_copy.stop = self.stop
         qs_copy.highlight_fields = list(self.highlight_fields)
+        qs_copy.group_field = self.group_field
 
         # set copies of list and dict attributes
         qs_copy.search_qs = list(self.search_qs)
@@ -573,6 +630,7 @@ def _clone(self) -> "SolrQuerySet":
         qs_copy.field_list = list(self.field_list)
         qs_copy.range_facet_fields = list(self.range_facet_fields)
         qs_copy.highlight_opts = dict(self.highlight_opts)
+        qs_copy.group_opts = dict(self.group_opts)
         qs_copy.raw_params = dict(self.raw_params)
         qs_copy.facet_field_list = list(self.facet_field_list)
         qs_copy.facet_opts = dict(self.facet_opts)
@@ -613,7 +671,7 @@ def __getitem__(self, k):
         # if the result cache is already populated,
         # return the requested index or slice
         if self._result_cache:
-            return self._result_cache.docs[k]
+            return self._result_cache.items[k]
 
         qs_copy = self._clone()
 

diff --git a/parasolr/query/tests/test_aliased_queryset.py b/parasolr/query/tests/test_aliased_queryset.py
@@ -145,6 +145,15 @@ def test_highlight(self, mock_highlight):
         self.mysqs.highlight("foo_b")
         mock_highlight.assert_called_with("foo_b")
 
+    @patch("parasolr.query.queryset.SolrQuerySet.group")
+    def test_group(self, mock_group):
+        # args should be unaliased
+        self.mysqs.group("name")
+        mock_group.assert_called_with(self.mysqs.field_aliases["name"])
+        # unknown should be ignored
+        self.mysqs.group("foo_b")
+        mock_group.assert_called_with("foo_b")
+
     @patch("parasolr.query.queryset.SolrQuerySet.get_facets")
     def test_get_facets(self, mock_get_facets):
         sample_facet_result = {

diff --git a/parasolr/query/tests/test_queryset.py b/parasolr/query/tests/test_queryset.py
@@ -34,6 +34,8 @@ def test_query_opts(self):
             "facet",
             "stats",
             "stats.field",
+            "group",
+            "group.field",
         ]:
             assert opt not in query_opts
 
@@ -97,6 +99,16 @@ def test_query_opts(self):
         assert query_opts["facet.range"] == sqs.range_facet_fields
         assert range_facet_opt in query_opts
 
+    def test_query_opts_group(self):
+        mocksolr = Mock(spec=SolrClient)
+        sqs = SolrQuerySet(mocksolr)
+        sqs.group_field = "group_id"
+        sqs.group_opts = {"group.limit": 3}
+        query_opts = sqs.query_opts()
+        assert query_opts["group"] == True
+        assert query_opts["group.field"] == "group_id"
+        assert query_opts["group.limit"] == 3
+
     def test_query(self):
         mocksolr = Mock(spec=SolrClient)
         mocksolr.query.return_value.docs = []
@@ -494,6 +506,14 @@ def test_highlight(self):
         assert sqs.highlight_fields == []
         assert sqs.highlight_opts == {}
 
+    def test_group(self):
+        mocksolr = Mock(spec=SolrClient)
+        sqs = SolrQuerySet(mocksolr)
+        # group field with a limit option
+        group_qs = sqs.group("content", limit=3)
+        assert group_qs.group_field == "content"
+        assert group_qs.group_opts == {"group.limit": 3}
+
     def test_raw_query_parameters(self):
         mocksolr = Mock(spec=SolrClient)
         sqs = SolrQuerySet(mocksolr)
@@ -531,18 +551,6 @@ def test_get_highlighting(self):
         sqs._result_cache = Mock(highlighting=mock_highlights)
         assert sqs.get_highlighting() == mock_highlights
 
-        # should populate cache if empty
-        sqs._result_cache = None
-        with patch.object(sqs, "get_results") as mock_get_results:
-
-            def set_result_cache():
-                sqs._result_cache = Mock()
-
-            mock_get_results.side_effect = set_result_cache
-
-            sqs.get_highlighting()
-            mock_get_results.assert_called_with()
-
     def test_all(self):
         mocksolr = Mock(spec=SolrClient)
         sqs = SolrQuerySet(mocksolr)
@@ -759,7 +767,7 @@ def test_get_item(self):
 
         # simulate result cache already populated
         sqs._result_cache = Mock()
-        sqs._result_cache.docs = [1, 2, 3, 4, 5]
+        sqs._result_cache.items = [1, 2, 3, 4, 5]
         # single item
         assert sqs[0] == 1
         assert sqs[1] == 2