Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(performance): implement sampling for lifecycle queries #14283

Merged
merged 9 commits into from
Feb 17, 2023
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
1 change: 1 addition & 0 deletions frontend/src/lib/constants.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,7 @@ export const FEATURE_FLAGS = {
FF_JSON_PAYLOADS: 'ff-json-payloads', // owner @EDsCODE
PERSON_GROUPS_PROPERTY_DEFINITIONS: 'person-groups-property-definitions', // owner: @macobo
DATA_EXPLORATION_QUERIES_ON_DASHBOARDS: 'data-exploration-queries-on-dashboards', // owner: @pauldambra
SAMPLING: 'sampling', // owner: @yakkomajuri
}

/** Which self-hosted plan's features are available with Cloud's "Standard" plan (aka card attached). */
Expand Down
19 changes: 19 additions & 0 deletions frontend/src/scenes/insights/EditorFilters/EditorFilters.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ import {
isStickinessFilter,
isTrendsFilter,
} from 'scenes/insights/sharedUtils'
import { SamplingFilter } from './SamplingFilter'

export interface EditorFiltersProps {
insightProps: InsightLogicProps
Expand Down Expand Up @@ -242,6 +243,24 @@ export function EditorFilters({ insightProps, showing }: EditorFiltersProps): JS
},
]),
},
{
title: 'Sampling',
position: 'right',
editorFilters: filterFalsy([
{
key: 'sampling',
label: '',
position: 'right',
component: SamplingFilter,
tooltip: (
<>
Sampling computes the result on a subset of the data, making insights load significantly
faster. Results are not exact, but statistically similar to the exact results.
</>
),
},
]),
},
].filter((x) => x.editorFilters.length > 0)

const leftFilters = editorFilters.reduce(
Expand Down
33 changes: 33 additions & 0 deletions frontend/src/scenes/insights/EditorFilters/SamplingFilter.tsx
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
import { EditorFilterProps, InsightType } from '~/types'
import './LifecycleToggles.scss'
import { LemonSwitch } from '@posthog/lemon-ui'
import { insightLogic } from 'scenes/insights/insightLogic'
import { useActions, useValues } from 'kea'
import { featureFlagLogic } from 'lib/logic/featureFlagLogic'
import { FEATURE_FLAGS } from 'lib/constants'

// Editor-filter toggle that lets the user run the current insight on a sampled
// subset of events (the sampling rate itself is decided server-side).
// Renders nothing unless the `sampling` feature flag is enabled AND the insight
// is a lifecycle query — sampling is only wired up for lifecycle so far.
export function SamplingFilter({ filters: editorFilters }: EditorFilterProps): JSX.Element {
    const { setFilters } = useActions(insightLogic)
    const { filters } = useValues(insightLogic)
    const { featureFlags } = useValues(featureFlagLogic)

    // Sampling is currently behind a feature flag and only available on lifecycle queries
    if (!featureFlags[FEATURE_FLAGS.SAMPLING] || editorFilters.insight !== InsightType.LIFECYCLE) {
        return <></>
    }
    // NOTE(review): styling comes from LifecycleToggles.scss (imported above) even
    // though the class name here is "SamplingFilter" — confirm that's intentional.
    return (
        <div className="SamplingFilter">
            <LemonSwitch
                checked={!!filters.sample_results}
                label={<span>Show sampled results</span>}
                // Spread the existing filters so only sample_results changes.
                onChange={(newChecked) => setFilters({ ...filters, sample_results: newChecked })}
            />
        </div>
    )
}
1 change: 1 addition & 0 deletions frontend/src/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -1470,6 +1470,7 @@ export interface FilterType {
breakdown_value?: string | number
breakdown_group_type_index?: number | null
aggregation_group_type_index?: number // Groups aggregation
sample_results?: boolean | null
}

export interface PropertiesTimelineFilterType {
Expand Down
2 changes: 2 additions & 0 deletions posthog/models/filters/filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
InsightMixin,
LimitMixin,
OffsetMixin,
SampleMixin,
SearchMixin,
SelectorMixin,
ShownAsMixin,
Expand Down Expand Up @@ -90,6 +91,7 @@ class Filter(
EmailMixin,
BaseFilter,
ClientQueryIdMixin,
SampleMixin,
):
"""
Filters allow us to describe what events to show/use in various places in the system, for example Trends or Funnels.
Expand Down
16 changes: 16 additions & 0 deletions posthog/models/filters/mixins/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -554,3 +554,19 @@ class EmailMixin(BaseParamMixin):
def email(self) -> Optional[str]:
email = self._data.get("email", None)
return email


class SampleMixin(BaseParamMixin):
    """
    Sample factor for a query.

    Exposes ``sample_factor``: the fixed server-side sampling rate when the
    request asked for sampled results (``"sample_results"`` truthy in the
    filter data), or ``None`` when sampling is disabled. The rate is
    hard-coded here rather than accepted from the client, keeping the API
    free to change who decides the sampling rate later.
    """

    # Fixed sampling rate applied whenever sampling is requested.
    default_sample_factor = 0.1

    @cached_property
    def sample_factor(self) -> Optional[float]:
        # None means "no sampling": query builders only emit a SAMPLE clause
        # when this value is truthy.
        if self._data.get("sample_results", False):
            return self.default_sample_factor
        return None
73 changes: 73 additions & 0 deletions posthog/queries/test/__snapshots__/test_lifecycle.ambr
Original file line number Diff line number Diff line change
@@ -1,3 +1,76 @@
# name: TestLifecycle.test_sampling
'
WITH 'day' AS selected_period,
periods AS
(SELECT dateSub(day, number, dateTrunc(selected_period, toDateTime('2020-01-19 23:59:59', 'UTC'))) AS start_of_period
FROM numbers(dateDiff('day', dateTrunc('day', toDateTime('2020-01-12 00:00:00', 'UTC')), dateTrunc('day', toDateTime('2020-01-19 23:59:59', 'UTC') + INTERVAL 1 day))))
SELECT groupArray(start_of_period) as date,
groupArray(counts) as data,
status
FROM
(SELECT if(status = 'dormant', toInt64(SUM(counts)) * toInt16(-1), toInt64(SUM(counts))) as counts,
start_of_period,
status
FROM
(SELECT periods.start_of_period as start_of_period,
toUInt16(0) AS counts,
status
FROM periods
CROSS JOIN
(SELECT status
FROM
(SELECT ['new', 'returning', 'resurrecting', 'dormant'] as status) ARRAY
JOIN status) as sec
ORDER BY status,
start_of_period
UNION ALL SELECT start_of_period,
count(DISTINCT person_id) counts,
status
FROM
(SELECT pdi.person_id as person_id,
arraySort(groupUniqArray(dateTrunc('day', toTimeZone(toDateTime(events.timestamp, 'UTC'), 'UTC')))) AS all_activity,
arrayPopBack(arrayPushFront(all_activity, dateTrunc('day', toTimeZone(toDateTime(min(person.created_at), 'UTC'), 'UTC')))) as previous_activity,
arrayPopFront(arrayPushBack(all_activity, dateTrunc('day', toDateTime('1970-01-01')))) as following_activity,
arrayMap((previous, current, index) -> if(previous = current, 'new', if(current - INTERVAL 1 day = previous
AND index != 1, 'returning', 'resurrecting')), previous_activity, all_activity, arrayEnumerate(all_activity)) as initial_status,
arrayMap((current, next) -> if(current + INTERVAL 1 day = next, '', 'dormant'), all_activity, following_activity) as dormant_status,
arrayMap(x -> x + INTERVAL 1 day, arrayFilter((current, is_dormant) -> is_dormant = 'dormant', all_activity, dormant_status)) as dormant_periods,
arrayMap(x -> 'dormant', dormant_periods) as dormant_label,
arrayConcat(arrayZip(all_activity, initial_status), arrayZip(dormant_periods, dormant_label)) as temp_concat,
arrayJoin(temp_concat) as period_status_pairs,
period_status_pairs.1 as start_of_period,
period_status_pairs.2 as status,
toDateTime(min(person.created_at), 'UTC') AS created_at
FROM events AS e SAMPLE 0.1
INNER JOIN
(SELECT distinct_id,
argMax(person_id, version) as person_id
FROM person_distinct_id2
WHERE team_id = 2
GROUP BY distinct_id
HAVING argMax(is_deleted, version) = 0) AS pdi ON e.distinct_id = pdi.distinct_id
INNER JOIN
(SELECT id,
argMax(created_at, version) as created_at
FROM person
WHERE team_id = 2
GROUP BY id
HAVING max(is_deleted) = 0) person ON person.id = pdi.person_id
WHERE team_id = 2
AND event = '$pageview'
AND timestamp >= toDateTime(dateTrunc('day', toDateTime('2020-01-12 00:00:00', 'UTC'))) - INTERVAL 1 day
AND timestamp < toDateTime(dateTrunc('day', toDateTime('2020-01-19 23:59:59', 'UTC'))) + INTERVAL 1 day
GROUP BY pdi.person_id)
GROUP BY start_of_period,
status)
WHERE start_of_period <= dateTrunc('day', toDateTime('2020-01-19 23:59:59', 'UTC'))
AND start_of_period >= dateTrunc('day', toDateTime('2020-01-12 00:00:00', 'UTC'))
GROUP BY start_of_period,
status
ORDER BY start_of_period ASC)
GROUP BY status
'
---
# name: TestLifecycle.test_timezones
'
WITH 'day' AS selected_period,
Expand Down
35 changes: 35 additions & 0 deletions posthog/queries/test/test_lifecycle.py
Original file line number Diff line number Diff line change
Expand Up @@ -775,6 +775,41 @@ def test_timezones(self):
],
)

# Ensure running the query with sampling works + generate a snapshot that shows sampling in the query
@snapshot_clickhouse_queries
def test_sampling(self):
    """A lifecycle query with sample_results=True runs without error; the
    recorded snapshot shows the SAMPLE clause in the generated SQL."""
    fixture = [
        (
            "p1",
            [
                "2020-01-11T12:00:00Z",
                "2020-01-12T12:00:00Z",
                "2020-01-13T12:00:00Z",
                "2020-01-15T12:00:00Z",
                "2020-01-17T12:00:00Z",
                "2020-01-19T12:00:00Z",
            ],
        ),
        ("p2", ["2020-01-09T12:00:00Z", "2020-01-12T12:00:00Z"]),
        ("p3", ["2020-01-12T12:00:00Z"]),
        ("p4", ["2020-01-15T12:00:00Z"]),
    ]
    self._create_events(data=fixture)

    sampled_filter = Filter(
        data={
            "date_from": "2020-01-12T00:00:00Z",
            "date_to": "2020-01-19T00:00:00Z",
            "events": [{"id": "$pageview", "type": "events", "order": 0}],
            "shown_as": TRENDS_LIFECYCLE,
            "sample_results": True,
        }
    )
    Trends().run(sampled_filter, self.team)


def assertLifecycleResults(results, expected):
sorted_results = [{"status": r["status"], "data": r["data"]} for r in sorted(results, key=lambda r: r["status"])]
Expand Down
4 changes: 4 additions & 0 deletions posthog/queries/trends/lifecycle.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,9 @@ def get_query(self):
created_at_clause = "person.created_at" if not self._using_person_on_events else "person_created_at"

null_person_filter = f"AND notEmpty({self.EVENT_TABLE_ALIAS}.person_id)" if self._using_person_on_events else ""

sample_clause = f"SAMPLE {self._filter.sample_factor}" if self._filter.sample_factor else ""

return (
LIFECYCLE_EVENTS_QUERY.format(
event_table_alias=self.EVENT_TABLE_ALIAS,
Expand All @@ -168,6 +171,7 @@ def get_query(self):
null_person_filter=null_person_filter,
entity_prop_query=entity_prop_query,
interval=self._filter.interval,
sample_clause=sample_clause,
),
self.params,
)
Expand Down
1 change: 1 addition & 0 deletions posthog/queries/trends/sql.py
Original file line number Diff line number Diff line change
Expand Up @@ -439,6 +439,7 @@
period_status_pairs.2 as status,
toDateTime(min({created_at_clause}), %(timezone)s) AS created_at
FROM events AS {event_table_alias}
{sample_clause}
{distinct_id_query}
{person_query}
{groups_query}
Expand Down
2 changes: 2 additions & 0 deletions posthog/queries/trends/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,8 @@ def parse_response(stats: Dict, filter: Filter, additional_values: Dict = {}) ->
counts = stats[1]
labels = [item.strftime("%-d-%b-%Y{}".format(" %H:%M" if filter.interval == "hour" else "")) for item in stats[0]]
days = [item.strftime("%Y-%m-%d{}".format(" %H:%M:%S" if filter.interval == "hour" else "")) for item in stats[0]]
if filter.sample_factor:
counts = [c * (1 / filter.sample_factor) for c in counts]
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nit: Probably worth creating a correct_for_sampling function.

return {
"data": [float(c) for c in counts],
"count": float(sum(counts)),
Expand Down