Group-By in Polars

In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import polars as pl
import pyarrow
import seaborn as sns


# Download the dataset or run in a Kaggle notebook:
# https://www.kaggle.com/datasets/mkechinov/ecommerce-behavior-data-from-multi-category-store
#
# The CSV location can be overridden with the ECOMMERCE_CSV_PATH environment
# variable so the notebook runs on other machines without editing code; the
# original local path is kept as the fallback so existing setups keep working.
DATA_PATH = os.environ.get(
    "ECOMMERCE_CSV_PATH",
    "C:/Users/LENOVO/Downloads/2019-Oct.csv",
)
df = pl.read_csv(DATA_PATH)

In [2]:
# Preview the first five rows (the default for head) to check columns and dtypes.
df.head(n=5)

event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session
str,str,i64,i64,str,str,f64,i64,str
"""2019-10-01 00:…","""view""",44600062,2103807459595387724,,"""shiseido""",35.79,541312140,"""72d76fde-8bb3-…"
"""2019-10-01 00:…","""view""",3900821,2053013552326770905,"""appliances.env…","""aqua""",33.2,554748717,"""9333dfbd-b87a-…"
"""2019-10-01 00:…","""view""",17200506,2053013559792632471,"""furniture.livi…",,543.1,519107250,"""566511c2-e2e3-…"
"""2019-10-01 00:…","""view""",1307067,2053013558920217191,"""computers.note…","""lenovo""",251.74,550050854,"""7c90fc70-0e80-…"
"""2019-10-01 00:…","""view""",1004237,2053013555631882655,"""electronics.sm…","""apple""",1081.98,535871217,"""c6bd7419-2748-…"


1. Group-By - pl.count (one grouping column)

In [4]:
# Number of rows per brand; agg accepts a single expression without a list.
df.group_by("brand").agg(pl.count()).head()

brand,count
str,u32
"""a-derma""",298
"""sacvoyage""",453
"""dobrusskijfarf…",770
"""danielklein""",1186
"""banbao""",185


In [10]:
# Sorting in polars: rank brands by row count, largest first.
# sort(..., descending=True) orders the result in one step instead of sorting
# ascending and then reversing the whole frame.
(
    df.group_by("brand")
    .agg(pl.count())
    .sort("count", descending=True)
    .head()
)

brand,count
str,u32
,6113008
"""samsung""",5282775
"""apple""",4122554
"""xiaomi""",3083763
"""huawei""",1111205


In [12]:
# Aggregate in polars, then convert the grouped result to pandas and
# order it there by count, largest first.
df_count = (
    df.group_by("brand")
    .agg(pl.count())
    .to_pandas()
    .sort_values(by="count", ascending=False)
)

print(df_count.shape)

df_count.head()

(3446, 2)


Unnamed: 0,brand,count
1616,,6113008
2210,samsung,5282775
2703,apple,4122554
127,xiaomi,3083763
2017,huawei,1111205


In [14]:
# Row count per brand plus each brand's share of all rows.
# n_pct is a fraction of the total (0-1), not a value scaled to 0-100.
# A single descending sort replaces the ascending sort followed by reverse.
(
    df.group_by("brand")
    .agg(pl.count())
    .sort("count", descending=True)
    .with_columns(
        (pl.col("count") / pl.col("count").sum()).alias("n_pct")
    )
    .head(10)
)

brand,count,n_pct
str,u32,f64
,6113008,0.144009
"""samsung""",5282775,0.124451
"""apple""",4122554,0.097118
"""xiaomi""",3083763,0.072647
"""huawei""",1111205,0.026178
"""lucente""",655861,0.015451
"""lg""",562404,0.013249
"""bosch""",557090,0.013124
"""oppo""",482887,0.011376
"""sony""",456644,0.010758


In [15]:
# Random peek at five rows; the fixed seed makes the same rows come back on
# every run so the displayed output is reproducible.
df.sample(5, seed=42)

event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session
str,str,i64,i64,str,str,f64,i64,str
"""2019-10-31 17:…","""view""",14500002,2053013554725912943,"""appliances.kit…","""bosch""",41.74,523599631,"""0391c41c-72d3-…"
"""2019-10-19 18:…","""view""",16400025,2053013558249128509,,"""tefal""",90.07,536204791,"""aafc138f-23c3-…"
"""2019-10-28 17:…","""view""",2201036,2053013560555995845,,"""canon""",308.86,565142503,"""938b2523-1e38-…"
"""2019-10-30 09:…","""view""",12718373,2053013553559896355,,"""triangle""",48.91,518060299,"""bc05feae-cd9a-…"
"""2019-10-21 06:…","""view""",10900222,2053013555069845885,"""appliances.kit…","""moulinex""",42.78,554071220,"""846699fe-44ce-…"


Value Counts works as expected

In [17]:
# Series.value_counts tallies each event type directly.
df.get_column("event_type").value_counts()

# More verbose version with group_by; as the last expression of the cell,
# this is the result that gets displayed.
df.group_by("event_type").agg([pl.count()])

event_type,count
str,u32
"""purchase""",742849
"""cart""",926516
"""view""",40779399


2. Group-By - pl.count (two grouping columns)

In [18]:
# Row count for every (brand, event_type) combination, in long format.
df_brand_event = df.group_by("brand", "event_type").agg(pl.count())

df_brand_event.head()

brand,event_type,count
str,str,u32
"""aqua""","""view""",14155
"""brw""","""view""",170638
"""gran-stone""","""view""",3129
"""lg""","""view""",540131
"""microsoft""","""view""",21435


In [19]:
# Reshape long -> wide: one row per brand, one count column per event type.
# Brand/event combinations that never occur show up as nulls.
df_brand_event_wide = df_brand_event.pivot(
    values="count",
    index="brand",
    columns="event_type",
)
df_brand_event_wide.head()

brand,view,cart,purchase
str,u32,u32,u32
"""aqua""",14155,391.0,187
"""brw""",170638,,808
"""gran-stone""",3129,,26
"""lg""",540131,13546.0,8727
"""microsoft""",21435,313.0,225


In [21]:
# Pivot to wide format and rank brands by purchase count, largest first.
# One descending sort replaces sort-then-reverse; nulls_last=True keeps brands
# whose purchase count is null at the bottom of the ranking.
df_brand_event_wide = (
    df_brand_event
    .pivot(values="count", index=["brand"], columns=["event_type"])
    .sort("purchase", descending=True, nulls_last=True)
)

df_brand_event_wide.head()

brand,view,cart,purchase
str,u32,u32,u32
"""samsung""",4806630,303249,172896
"""apple""",3770597,209084,142873
,6035988,18806,58214
"""xiaomi""",2922650,104497,56616
"""huawei""",1045572,42132,23501


In [22]:
# Keep the count columns and append three funnel ratios per brand:
#   cart_by_views = cart / view
#   buy_by_cart   = purchase / cart
#   buy_by_views  = purchase / view
# The values are plain ratios; a null count gives a null ratio.
agg_performance = df_brand_event_wide.with_columns(
    cart_by_views=pl.col("cart") / pl.col("view"),
    buy_by_cart=pl.col("purchase") / pl.col("cart"),
    buy_by_views=pl.col("purchase") / pl.col("view"),
)
agg_performance.head()

brand,view,cart,purchase,cart_by_views,buy_by_cart,buy_by_views
str,u32,u32,u32,f64,f64,f64
"""samsung""",4806630,303249,172896,0.06309,0.570145,0.03597
"""apple""",3770597,209084,142873,0.055451,0.683328,0.037891
,6035988,18806,58214,0.003116,3.095501,0.009644
"""xiaomi""",2922650,104497,56616,0.035754,0.541795,0.019371
"""huawei""",1045572,42132,23501,0.040296,0.557795,0.022477


In [27]:
# Quantiles of the three ratios, restricted to brands with buy_by_cart below 1
# (i.e. fewer purchase rows than cart rows). The quantiles are computed in pandas.
(
    agg_performance
    .filter(pl.col("buy_by_cart") < 1)
    .select(["cart_by_views", "buy_by_cart", "buy_by_views"])
    .to_pandas()
    .quantile([0, 0.25, 0.5, 0.75, 0.99, 1])
)

#.boxplot()

Unnamed: 0,cart_by_views,buy_by_cart,buy_by_views
0.0,0.002124,0.090909,0.000826
0.25,0.012777,0.5,0.006614
0.5,0.018381,0.612583,0.011384
0.75,0.025814,0.72449,0.015649
0.99,0.061554,0.955687,0.035142
1.0,0.142418,0.986928,0.052284


In [28]:
# Look at the first five raw event rows again (head's default row count).
df.head(n=5)

event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session
str,str,i64,i64,str,str,f64,i64,str
"""2019-10-01 00:…","""view""",44600062,2103807459595387724,,"""shiseido""",35.79,541312140,"""72d76fde-8bb3-…"
"""2019-10-01 00:…","""view""",3900821,2053013552326770905,"""appliances.env…","""aqua""",33.2,554748717,"""9333dfbd-b87a-…"
"""2019-10-01 00:…","""view""",17200506,2053013559792632471,"""furniture.livi…",,543.1,519107250,"""566511c2-e2e3-…"
"""2019-10-01 00:…","""view""",1307067,2053013558920217191,"""computers.note…","""lenovo""",251.74,550050854,"""7c90fc70-0e80-…"
"""2019-10-01 00:…","""view""",1004237,2053013555631882655,"""electronics.sm…","""apple""",1081.98,535871217,"""c6bd7419-2748-…"


3. Group-By - pl.count (three grouping columns: user_id, brand, event_type)

In [29]:
# Row count per (user_id, brand, event_type).
# Timing previously measured with %%timeit:
# 8.72 s ± 246 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
df_user_brand_event = df.group_by("user_id", "brand", "event_type").agg(pl.count())

df_user_brand_event.head()

user_id,brand,event_type,count
i64,str,str,u32
541312140,"""shiseido""","""view""",2
512742880,"""pulser""","""view""",37
555444559,"""haier""","""view""",13
555446831,"""apple""","""view""",10
554754045,"""hotpoint-arist…","""view""",9


In [30]:
# (rows, columns) of the long-format per-user counts.
(df_user_brand_event.height, df_user_brand_event.width)

(12028150, 4)

In [31]:
# Pivot to one row per (user_id, brand) with a count column per event type,
# then rank by purchase count, largest first.
# A single descending sort avoids a second full pass to reverse the frame;
# nulls_last=True keeps pairs with a null purchase count at the bottom.
# NOTE: this overwrites df_user_brand_event with the wide frame, so the cell
# cannot be re-run on its own -- re-run the group_by cell first.
df_user_brand_event = (
    df_user_brand_event
    .pivot(values="count", index=["user_id", "brand"], columns=["event_type"])
    .sort("purchase", descending=True, nulls_last=True)
)
df_user_brand_event.shape

(11158112, 5)

In [32]:
# Top five rows of the frame (head's default row count).
df_user_brand_event.head(n=5)

user_id,brand,view,purchase,cart
i64,str,u32,u32,u32
523974502,"""samsung""",385,272,218
543312954,"""samsung""",443,192,103
519267944,"""apple""",314,183,184
513117637,"""apple""",438,183,56
517728689,"""samsung""",429,171,486


In [33]:
# Random sample of ten rows; the fixed seed makes the draw repeatable so the
# displayed rows do not change between runs.
df_user_brand_event.sample(10, seed=42)

user_id,brand,view,purchase,cart
i64,str,u32,u32,u32
512399252,"""lemark""",1,,
513282718,"""topface""",2,,
518895681,"""stels""",2,,
515888702,"""samsung""",5,,
556333372,"""samsung""",6,2.0,2.0
561939669,"""samsung""",2,,
560376610,"""artel""",1,,
550316295,"""apple""",1,,
515009595,"""orient""",1,,
546503214,"""xiaomi""",6,1.0,2.0


In [34]:
# Replace missing event counts with 0 in all three count columns at once;
# a null here means no rows of that event type for the (user_id, brand) pair.
df_user_brand_event = df_user_brand_event.with_columns(
    pl.col(["purchase", "view", "cart"]).fill_null(strategy="zero")
)

In [35]:
# Random sample of ten rows; the fixed seed makes the draw repeatable so the
# displayed rows do not change between runs.
df_user_brand_event.sample(10, seed=42)

user_id,brand,view,purchase,cart
i64,str,u32,u32,u32
547307134,"""philips""",4,0,0
514093605,"""rebus""",3,0,0
560771867,"""delonghi""",1,0,0
512615513,,1,0,0
540303961,"""joie""",1,0,0
540660337,"""hp""",1,0,0
545806761,"""komfort-s""",4,0,0
513503095,"""t-max""",17,0,0
518865905,"""apple""",3,0,0
515184056,"""acer""",10,0,0


In [36]:
# The top user_ids look like distributors or resellers: an individual shopper
# is unlikely to record hundreds of purchases of a single brand.
df_user_brand_event.head(n=10)

user_id,brand,view,purchase,cart
i64,str,u32,u32,u32
523974502,"""samsung""",385,272,218
543312954,"""samsung""",443,192,103
519267944,"""apple""",314,183,184
513117637,"""apple""",438,183,56
517728689,"""samsung""",429,171,486
541510103,"""samsung""",333,165,207
513320236,"""samsung""",305,135,85
530834332,"""samsung""",270,129,33
547330965,"""apple""",491,125,31
553431815,"""apple""",174,124,10


In [37]:
# Purchases per view for each (user_id, brand) pair.
# Missing view counts were filled with 0 earlier, so a plain division would
# produce inf (purchase > 0, view == 0) or NaN (0 / 0). Those rows get null
# instead, so they cannot distort later aggregates or sorts.
# Despite the "pct" name, the value is a ratio and is not multiplied by 100.
df_user_brand_event = df_user_brand_event.with_columns(
    pl.when(pl.col("view") > 0)
    .then(pl.col("purchase") / pl.col("view"))
    .otherwise(None)
    .alias("pct_buy_views")
)