Skip to content

Commit

Permalink
perf(insights): Speed up loading properties (#11037)
Browse files Browse the repository at this point in the history
* perf(insights): Speed up loading properties

* paginator that doesn't count with manual limit and offset

* fix tests and types

* remove console.log

* one more test that relied on count

* add a full count to every row

* make a query context object that can be used to count or return results

* silence unrelated homepage error

* minimise change surface

* minimise change surface

* override less of the pager

Co-authored-by: Paul D'Ambra <paul@posthog.com>
  • Loading branch information
timgl and pauldambra committed Aug 17, 2022
1 parent 4d47d14 commit 5780a28
Show file tree
Hide file tree
Showing 2 changed files with 125 additions and 31 deletions.
2 changes: 1 addition & 1 deletion cypress/e2e/a11y.js
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ describe('a11y', () => {
it('home should have no accessibility violations', () => {
cy.get('[data-attr="menu-item-projecthomepage"]').click()
cy.injectAxe()
reportA11y({ includedImpacts: ['critical'] }, 'home-page-critical', false)
reportA11y({ includedImpacts: ['critical'] }, 'home-page-critical', true)
reportA11y({ includedImpacts: ['serious'] }, 'home-page-serious', true)
})

Expand Down
154 changes: 124 additions & 30 deletions posthog/api/property_definition.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
import dataclasses
import json
from typing import Type
from typing import Any, List, Optional, Type

from django.db import connection
from django.db.models import Prefetch
from rest_framework import mixins, permissions, serializers, viewsets
from rest_framework.pagination import LimitOffsetPagination

from posthog.api.routing import StructuredViewSetMixin
from posthog.api.tagged_item import TaggedItemSerializerMixin, TaggedItemViewSetMixin
Expand All @@ -12,6 +15,44 @@
from posthog.models import PropertyDefinition, TaggedItem
from posthog.permissions import OrganizationMemberPermissions, TeamMemberAccessPermission


@dataclasses.dataclass
class QueryContext:
"""
The raw query has multiple parameters that change the SQL.
Particularly
* what the projection is - it is used to count and to load the models
* and whether results should be ordered and paged
"""

projection: str
table: str
team_id: int
name_filter: str
numerical_filter: str
search_query: str
event_property_filter: str
is_feature_flag_filter: str
with_paging: bool
limit: int
offset: int

def as_sql(self):
query = f"""
SELECT {self.projection}
FROM {self.table}
WHERE team_id = {self.team_id} AND name NOT IN %(excluded_properties)s
{self.name_filter} {self.numerical_filter} {self.search_query} {self.event_property_filter} {self.is_feature_flag_filter}
"""
if self.with_paging:
query += f"""
ORDER BY is_event_property DESC, query_usage_30_day DESC NULLS LAST, name ASC
LIMIT {self.limit} OFFSET {self.offset}
"""

return query


# Properties generated by ingestion we don't want to show to users
HIDDEN_PROPERTY_DEFINITIONS = set(
[
Expand Down Expand Up @@ -48,6 +89,45 @@ def update(self, property_definition: PropertyDefinition, validated_data):
raise EnterpriseFeatureException()


class NotCountingLimitOffsetPaginator(LimitOffsetPagination):
"""
The standard LimitOffsetPagination was expensive because there are very many PropertyDefinition models
And we query them using a RawQuerySet that meant for each page of results we loaded all models twice
Once to count them and a second time because we would slice them in memory
This paginator expects the caller to have counted and paged the queryset
"""

def set_count(self, count: int) -> None:
self.count = count

def get_count(self, queryset) -> int:
"""
Determine an object count, supporting either querysets or regular lists.
"""
if self.count is None:
raise Exception("count must be manually set before paginating")

return self.count

def paginate_queryset(self, queryset, request, view=None) -> Optional[List[Any]]:
"""
Assumes the queryset has already had pagination applied
"""
self.count = self.get_count(queryset)
self.limit = self.get_limit(request)
if self.limit is None:
return None

self.offset = self.get_offset(request)
self.request = request

if self.count == 0 or self.offset > self.count:
return []

return list(queryset)


class PropertyDefinitionViewSet(
TaggedItemViewSetMixin,
StructuredViewSetMixin,
Expand All @@ -62,14 +142,15 @@ class PropertyDefinitionViewSet(
filter_backends = [TermSearchFilterBackend]
ordering = "name"
search_fields = ["name"]
pagination_class = NotCountingLimitOffsetPaginator

def get_queryset(self):
use_entreprise_taxonomy = self.request.user.organization.is_feature_available(AvailableFeature.INGESTION_TAXONOMY) # type: ignore
if use_entreprise_taxonomy:
use_enterprise_taxonomy = self.request.user.organization.is_feature_available(AvailableFeature.INGESTION_TAXONOMY) # type: ignore
if use_enterprise_taxonomy:
try:
from ee.models.property_definition import EnterprisePropertyDefinition
except ImportError:
use_entreprise_taxonomy = False
use_enterprise_taxonomy = False

properties_to_filter = self.request.GET.get("properties", None)
if properties_to_filter:
Expand Down Expand Up @@ -117,48 +198,61 @@ def get_queryset(self):
params = {
"event_names": tuple(event_names or []),
"names": names,
"team_id": self.team_id,
"excluded_properties": tuple(set.union(set(excluded_properties or []), HIDDEN_PROPERTY_DEFINITIONS)),
"is_feature_flag_like": "$feature/%",
**search_kwargs,
}

if use_entreprise_taxonomy:
limit = self.paginator.get_limit(self.request) # type: ignore
offset = self.paginator.get_offset(self.request) # type: ignore

property_definition_fields = ", ".join(
[f'"{f.column}"' for f in PropertyDefinition._meta.get_fields() if hasattr(f, "column")], # type: ignore
)
if use_enterprise_taxonomy:
# Prevent fetching deprecated `tags` field. Tags are separately fetched in TaggedItemSerializerMixin
property_definition_fields = ", ".join(
[f'"{f.column}"' for f in EnterprisePropertyDefinition._meta.get_fields() if hasattr(f, "column") and f.column not in ["deprecated_tags", "tags"]], # type: ignore
)

return EnterprisePropertyDefinition.objects.raw(
f"""
SELECT {property_definition_fields},
{event_property_field} AS is_event_property
FROM ee_enterprisepropertydefinition
FULL OUTER JOIN posthog_propertydefinition ON posthog_propertydefinition.id=ee_enterprisepropertydefinition.propertydefinition_ptr_id
WHERE team_id = %(team_id)s AND name NOT IN %(excluded_properties)s
{name_filter} {numerical_filter} {search_query} {event_property_filter} {is_feature_flag_filter}
ORDER BY is_event_property DESC, query_usage_30_day DESC NULLS LAST, name ASC
""",
params=params,
).prefetch_related(
qs = (
EnterprisePropertyDefinition.objects.prefetch_related(
Prefetch("tagged_items", queryset=TaggedItem.objects.select_related("tag"), to_attr="prefetched_tags"),
)
if use_enterprise_taxonomy
else PropertyDefinition.objects
)

property_definition_fields = ", ".join(
[f'"{f.column}"' for f in PropertyDefinition._meta.get_fields() if hasattr(f, "column")], # type: ignore
table = (
"ee_enterprisepropertydefinition FULL OUTER JOIN posthog_propertydefinition ON posthog_propertydefinition.id=ee_enterprisepropertydefinition.propertydefinition_ptr_id"
if use_enterprise_taxonomy
else "posthog_propertydefinition"
)

query_context = QueryContext(
projection=f"{property_definition_fields},{event_property_field} AS is_event_property",
table=table,
team_id=self.team_id,
name_filter=name_filter,
numerical_filter=numerical_filter,
search_query=search_query,
event_property_filter=event_property_filter,
is_feature_flag_filter=is_feature_flag_filter,
limit=limit,
offset=offset,
with_paging=True,
)

return PropertyDefinition.objects.raw(
f"""
SELECT {property_definition_fields},
{event_property_field} AS is_event_property
FROM posthog_propertydefinition
WHERE team_id = %(team_id)s AND name NOT IN %(excluded_properties)s
{name_filter} {numerical_filter} {search_query} {event_property_filter} {is_feature_flag_filter}
ORDER BY is_event_property DESC, query_usage_30_day DESC NULLS LAST, name ASC
""",
params=params,
query_context_for_count = dataclasses.replace(
query_context, with_paging=False, projection="count(*) as full_count"
)
with connection.cursor() as cursor:
cursor.execute(query_context_for_count.as_sql(), params)
full_count = cursor.fetchone()[0]

self.paginator.set_count(full_count) # type: ignore

return qs.raw(query_context.as_sql(), params=params,)

def get_serializer_class(self) -> Type[serializers.ModelSerializer]:
serializer_class = self.serializer_class
Expand Down

0 comments on commit 5780a28

Please sign in to comment.