Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(performance): implement sampling for lifecycle queries #14283

Merged
merged 9 commits into from
Feb 17, 2023
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
1 change: 1 addition & 0 deletions frontend/src/lib/constants.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,7 @@ export const FEATURE_FLAGS = {
FF_JSON_PAYLOADS: 'ff-json-payloads', // owner @EDsCODE
PERSON_GROUPS_PROPERTY_DEFINITIONS: 'person-groups-property-definitions', // owner: @macobo
DATA_EXPLORATION_QUERIES_ON_DASHBOARDS: 'data-exploration-queries-on-dashboards', // owner: @pauldambra
SAMPLING: 'sampling', // owner: @yakkomajuri
}

/** Which self-hosted plan's features are available with Cloud's "Standard" plan (aka card attached). */
Expand Down
19 changes: 19 additions & 0 deletions frontend/src/scenes/insights/EditorFilters/EditorFilters.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ import {
isStickinessFilter,
isTrendsFilter,
} from 'scenes/insights/sharedUtils'
import { SamplingFilter } from './SamplingFilter'

export interface EditorFiltersProps {
insightProps: InsightLogicProps
Expand Down Expand Up @@ -242,6 +243,24 @@ export function EditorFilters({ insightProps, showing }: EditorFiltersProps): JS
},
]),
},
{
title: 'Sampling',
position: 'right',
editorFilters: filterFalsy([
{
key: 'sampling',
label: '',
position: 'right',
component: SamplingFilter,
tooltip: (
<>
Sampling computes the result on a subset of the data, making insights load significantly
faster. Results are not exact, but statistically similar to the exact results.
</>
),
},
]),
},
].filter((x) => x.editorFilters.length > 0)

const leftFilters = editorFilters.reduce(
Expand Down
33 changes: 33 additions & 0 deletions frontend/src/scenes/insights/EditorFilters/SamplingFilter.tsx
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
import { EditorFilterProps, InsightType } from '~/types'
import './LifecycleToggles.scss'
import { LemonSwitch } from '@posthog/lemon-ui'
import { insightLogic } from 'scenes/insights/insightLogic'
import { useActions, useValues } from 'kea'
import { featureFlagLogic } from 'lib/logic/featureFlagLogic'
import { FEATURE_FLAGS } from 'lib/constants'

// Editor-filter toggle that lets the user run the current insight on a sampled
// subset of events (the sampling rate itself is decided server-side).
// Renders nothing unless the `sampling` feature flag is enabled AND the insight
// is a lifecycle query — sampling is only wired up for lifecycle so far.
export function SamplingFilter({ filters: editorFilters }: EditorFilterProps): JSX.Element {
    const { setFilters } = useActions(insightLogic)
    const { filters } = useValues(insightLogic)
    const { featureFlags } = useValues(featureFlagLogic)

    // Sampling is currently behind a feature flag and only available on lifecycle queries
    if (!featureFlags[FEATURE_FLAGS.SAMPLING] || editorFilters.insight !== InsightType.LIFECYCLE) {
        return <></>
    }
    // NOTE(review): styling comes from LifecycleToggles.scss (imported above) even
    // though the class name here is "SamplingFilter" — confirm that's intentional.
    return (
        <div className="SamplingFilter">
            <LemonSwitch
                checked={!!filters.sample_results}
                label={<span>Show sampled results</span>}
                // Spread the existing filters so only sample_results changes.
                onChange={(newChecked) => setFilters({ ...filters, sample_results: newChecked })}
            />
        </div>
    )
}
1 change: 1 addition & 0 deletions frontend/src/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -1470,6 +1470,7 @@ export interface FilterType {
breakdown_value?: string | number
breakdown_group_type_index?: number | null
aggregation_group_type_index?: number // Groups aggregation
sample_results?: boolean | null
}

export interface PropertiesTimelineFilterType {
Expand Down
2 changes: 2 additions & 0 deletions posthog/models/filters/filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
InsightMixin,
LimitMixin,
OffsetMixin,
SampleMixin,
SearchMixin,
SelectorMixin,
ShownAsMixin,
Expand Down Expand Up @@ -90,6 +91,7 @@ class Filter(
EmailMixin,
BaseFilter,
ClientQueryIdMixin,
SampleMixin,
):
"""
Filters allow us to describe what events to show/use in various places in the system, for example Trends or Funnels.
Expand Down
16 changes: 16 additions & 0 deletions posthog/models/filters/mixins/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -554,3 +554,19 @@ class EmailMixin(BaseParamMixin):
def email(self) -> Optional[str]:
email = self._data.get("email", None)
return email


class SampleMixin(BaseParamMixin):
    """
    Sample factor for a query.

    Exposes ``sample_factor``: the fixed server-side sampling rate when the
    request asked for sampled results (``"sample_results"`` truthy in the
    filter data), or ``None`` when sampling is disabled. The rate is
    hard-coded here rather than accepted from the client, keeping the API
    free to change who decides the sampling rate later.
    """

    # Fixed sampling rate applied whenever sampling is requested.
    default_sample_factor = 0.1

    @cached_property
    def sample_factor(self) -> Optional[float]:
        # None means "no sampling": query builders only emit a SAMPLE clause
        # when this value is truthy.
        if self._data.get("sample_results", False):
            return self.default_sample_factor
        return None
73 changes: 73 additions & 0 deletions posthog/queries/test/__snapshots__/test_lifecycle.ambr
Original file line number Diff line number Diff line change
@@ -1,3 +1,76 @@
# name: TestLifecycle.test_sampling
'
WITH 'day' AS selected_period,
periods AS
(SELECT dateSub(day, number, dateTrunc(selected_period, toDateTime('2020-01-19 23:59:59', 'UTC'))) AS start_of_period
FROM numbers(dateDiff('day', dateTrunc('day', toDateTime('2020-01-12 00:00:00', 'UTC')), dateTrunc('day', toDateTime('2020-01-19 23:59:59', 'UTC') + INTERVAL 1 day))))
SELECT groupArray(start_of_period) as date,
groupArray(counts) as data,
status
FROM
(SELECT if(status = 'dormant', toInt64(SUM(counts)) * toInt16(-1), toInt64(SUM(counts))) as counts,
start_of_period,
status
FROM
(SELECT periods.start_of_period as start_of_period,
toUInt16(0) AS counts,
status
FROM periods
CROSS JOIN
(SELECT status
FROM
(SELECT ['new', 'returning', 'resurrecting', 'dormant'] as status) ARRAY
JOIN status) as sec
ORDER BY status,
start_of_period
UNION ALL SELECT start_of_period,
count(DISTINCT person_id) counts,
status
FROM
(SELECT pdi.person_id as person_id,
arraySort(groupUniqArray(dateTrunc('day', toTimeZone(toDateTime(events.timestamp, 'UTC'), 'UTC')))) AS all_activity,
arrayPopBack(arrayPushFront(all_activity, dateTrunc('day', toTimeZone(toDateTime(min(person.created_at), 'UTC'), 'UTC')))) as previous_activity,
arrayPopFront(arrayPushBack(all_activity, dateTrunc('day', toDateTime('1970-01-01')))) as following_activity,
arrayMap((previous, current, index) -> if(previous = current, 'new', if(current - INTERVAL 1 day = previous
AND index != 1, 'returning', 'resurrecting')), previous_activity, all_activity, arrayEnumerate(all_activity)) as initial_status,
arrayMap((current, next) -> if(current + INTERVAL 1 day = next, '', 'dormant'), all_activity, following_activity) as dormant_status,
arrayMap(x -> x + INTERVAL 1 day, arrayFilter((current, is_dormant) -> is_dormant = 'dormant', all_activity, dormant_status)) as dormant_periods,
arrayMap(x -> 'dormant', dormant_periods) as dormant_label,
arrayConcat(arrayZip(all_activity, initial_status), arrayZip(dormant_periods, dormant_label)) as temp_concat,
arrayJoin(temp_concat) as period_status_pairs,
period_status_pairs.1 as start_of_period,
period_status_pairs.2 as status,
toDateTime(min(person.created_at), 'UTC') AS created_at
FROM events AS e SAMPLE 0.1
INNER JOIN
(SELECT distinct_id,
argMax(person_id, version) as person_id
FROM person_distinct_id2
WHERE team_id = 2
GROUP BY distinct_id
HAVING argMax(is_deleted, version) = 0) AS pdi ON e.distinct_id = pdi.distinct_id
INNER JOIN
(SELECT id,
argMax(created_at, version) as created_at
FROM person
WHERE team_id = 2
GROUP BY id
HAVING max(is_deleted) = 0) person ON person.id = pdi.person_id
WHERE team_id = 2
AND event = '$pageview'
AND timestamp >= toDateTime(dateTrunc('day', toDateTime('2020-01-12 00:00:00', 'UTC'))) - INTERVAL 1 day
AND timestamp < toDateTime(dateTrunc('day', toDateTime('2020-01-19 23:59:59', 'UTC'))) + INTERVAL 1 day
GROUP BY pdi.person_id)
GROUP BY start_of_period,
status)
WHERE start_of_period <= dateTrunc('day', toDateTime('2020-01-19 23:59:59', 'UTC'))
AND start_of_period >= dateTrunc('day', toDateTime('2020-01-12 00:00:00', 'UTC'))
GROUP BY start_of_period,
status
ORDER BY start_of_period ASC)
GROUP BY status
'
---
# name: TestLifecycle.test_timezones
'
WITH 'day' AS selected_period,
Expand Down
35 changes: 35 additions & 0 deletions posthog/queries/test/test_lifecycle.py
Original file line number Diff line number Diff line change
Expand Up @@ -775,6 +775,41 @@ def test_timezones(self):
],
)

# Ensure running the query with sampling works + generate a snapshot that shows sampling in the query
@snapshot_clickhouse_queries
def test_sampling(self):
    """A lifecycle query with sample_results=True runs without error; the
    recorded snapshot shows the SAMPLE clause in the generated SQL."""
    fixture = [
        (
            "p1",
            [
                "2020-01-11T12:00:00Z",
                "2020-01-12T12:00:00Z",
                "2020-01-13T12:00:00Z",
                "2020-01-15T12:00:00Z",
                "2020-01-17T12:00:00Z",
                "2020-01-19T12:00:00Z",
            ],
        ),
        ("p2", ["2020-01-09T12:00:00Z", "2020-01-12T12:00:00Z"]),
        ("p3", ["2020-01-12T12:00:00Z"]),
        ("p4", ["2020-01-15T12:00:00Z"]),
    ]
    self._create_events(data=fixture)

    sampled_filter = Filter(
        data={
            "date_from": "2020-01-12T00:00:00Z",
            "date_to": "2020-01-19T00:00:00Z",
            "events": [{"id": "$pageview", "type": "events", "order": 0}],
            "shown_as": TRENDS_LIFECYCLE,
            "sample_results": True,
        }
    )
    Trends().run(sampled_filter, self.team)


def assertLifecycleResults(results, expected):
sorted_results = [{"status": r["status"], "data": r["data"]} for r in sorted(results, key=lambda r: r["status"])]
Expand Down
4 changes: 4 additions & 0 deletions posthog/queries/trends/lifecycle.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,9 @@ def get_query(self):
created_at_clause = "person.created_at" if not self._using_person_on_events else "person_created_at"

null_person_filter = f"AND notEmpty({self.EVENT_TABLE_ALIAS}.person_id)" if self._using_person_on_events else ""

sample_clause = f"SAMPLE {self._filter.sample_factor}" if self._filter.sample_factor else ""

return (
LIFECYCLE_EVENTS_QUERY.format(
event_table_alias=self.EVENT_TABLE_ALIAS,
Expand All @@ -168,6 +171,7 @@ def get_query(self):
null_person_filter=null_person_filter,
entity_prop_query=entity_prop_query,
interval=self._filter.interval,
sample_clause=sample_clause,
),
self.params,
)
Expand Down
1 change: 1 addition & 0 deletions posthog/queries/trends/sql.py
Original file line number Diff line number Diff line change
Expand Up @@ -439,6 +439,7 @@
period_status_pairs.2 as status,
toDateTime(min({created_at_clause}), %(timezone)s) AS created_at
FROM events AS {event_table_alias}
{sample_clause}
{distinct_id_query}
{person_query}
{groups_query}
Expand Down
2 changes: 2 additions & 0 deletions posthog/queries/trends/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,8 @@ def parse_response(stats: Dict, filter: Filter, additional_values: Dict = {}) ->
counts = stats[1]
labels = [item.strftime("%-d-%b-%Y{}".format(" %H:%M" if filter.interval == "hour" else "")) for item in stats[0]]
days = [item.strftime("%Y-%m-%d{}".format(" %H:%M:%S" if filter.interval == "hour" else "")) for item in stats[0]]
if filter.sample_factor:
counts = [c * (1 / filter.sample_factor) for c in counts]
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nit: Probably worth creating a correct_for_sampling function.

return {
"data": [float(c) for c in counts],
"count": float(sum(counts)),
Expand Down