In [1]:
import pandas as pd

oct_df = pd.read_csv("data/raw/2019-Oct.csv")
nov_df = pd.read_csv("data/raw/2019-Nov.csv")

oct_df.shape, nov_df.shape


((42448764, 9), (67501979, 9))

In [2]:
oct_df.columns

Index(['event_time', 'event_type', 'product_id', 'category_id',
       'category_code', 'brand', 'price', 'user_id', 'user_session'],
      dtype='str')

In [3]:
oct_df["event_type"].value_counts().head(10)


event_type
view        40779399
cart          926516
purchase      742849
Name: count, dtype: int64

In [4]:
nov_df["event_type"].value_counts().head(10)


event_type
view        63556110
cart         3028930
purchase      916939
Name: count, dtype: int64

In [7]:
import duckdb

con = duckdb.connect()
con.execute("SET temp_directory='data/processed/tmp';")
con.execute("PRAGMA threads=4;")
con.execute("PRAGMA memory_limit='1GB';")

con.execute("""
CREATE OR REPLACE VIEW events AS
SELECT * FROM read_parquet([
  'data/processed/events_2019_oct.parquet',
  'data/processed/events_2019_nov.parquet'
]);
""")


<_duckdb.DuckDBPyConnection at 0x25ea6724230>

In [8]:
funnel = con.execute("""
WITH per_user AS (
  SELECT
    user_id,
    MAX(CASE WHEN event_type='view' THEN 1 ELSE 0 END) AS did_view,
    MAX(CASE WHEN event_type='cart' THEN 1 ELSE 0 END) AS did_cart,
    MAX(CASE WHEN event_type='purchase' THEN 1 ELSE 0 END) AS did_purchase
  FROM events
  GROUP BY user_id
)
SELECT
  COUNT(*) AS total_users,
  SUM(did_view) AS view_users,
  SUM(did_cart) AS cart_users,
  SUM(did_purchase) AS purchase_users,
  SUM(did_cart)*1.0 / NULLIF(SUM(did_view),0) AS view_to_cart,
  SUM(did_purchase)*1.0 / NULLIF(SUM(did_cart),0) AS cart_to_purchase,
  SUM(did_purchase)*1.0 / NULLIF(SUM(did_view),0) AS view_to_purchase
FROM per_user;
""").df()

funnel


Unnamed: 0,total_users,view_users,cart_users,purchase_users,view_to_cart,cart_to_purchase,view_to_purchase
0,5316649,5316128.0,1054133.0,697470.0,0.19829,0.661653,0.131199


In [12]:
import duckdb

con = duckdb.connect()

con.execute("SET temp_directory='data/processed/tmp';")
con.execute("PRAGMA threads=4;")
con.execute("PRAGMA memory_limit='1GB';")

con.execute("""
CREATE OR REPLACE VIEW events AS
SELECT * FROM read_parquet([
  'data/processed/events_2019_oct.parquet',
  'data/processed/events_2019_nov.parquet'
]);
""")


<_duckdb.DuckDBPyConnection at 0x25ea16fd170>

In [13]:
con.execute("""
CREATE OR REPLACE VIEW events_enriched AS
SELECT
  *,
  DATE(event_time) AS event_date,
  STRFTIME(event_time, '%Y-%m') AS event_month,
  STRFTIME(event_time, '%Y-%W') AS event_week
FROM events;
""")


<_duckdb.DuckDBPyConnection at 0x25ea16fd170>

In [14]:
con.execute("""
SELECT event_month, COUNT(*) AS events
FROM events_enriched
GROUP BY event_month
ORDER BY event_month
""").df()


Unnamed: 0,event_month,events
0,2019-10,42448764
1,2019-11,67501979
