## Group By & RFM Analysis
#Recency: did they make a purchase recently?
#Frequency: how often do they make a purchase?
#Monetary: how much do they spend?

In [1]:
from datetime import datetime
from pathlib import Path

import numpy as np
import pandas as pd
import polars as pl
import pyarrow

import matplotlib.pyplot as plt
import seaborn as sns

# Folder holding the raw event export. Kept in one constant so the notebook
# can be pointed at another machine's copy without editing the load call.
DATA_DIR = Path('C:/Users/LENOVO/Downloads')

# Load the full event log into a polars DataFrame.
df = pl.read_csv(DATA_DIR / '2019-Oct.csv')

In [2]:
# (rows, columns) of the raw event log as loaded from the CSV.
df.shape

(42448764, 9)

In [3]:
# Preview the first rows of the raw event log, including the inferred column dtypes.
df.head()

event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session
str,str,i64,i64,str,str,f64,i64,str
"""2019-10-01 00:…","""view""",44600062,2103807459595387724,,"""shiseido""",35.79,541312140,"""72d76fde-8bb3-…"
"""2019-10-01 00:…","""view""",3900821,2053013552326770905,"""appliances.env…","""aqua""",33.2,554748717,"""9333dfbd-b87a-…"
"""2019-10-01 00:…","""view""",17200506,2053013559792632471,"""furniture.livi…",,543.1,519107250,"""566511c2-e2e3-…"
"""2019-10-01 00:…","""view""",1307067,2053013558920217191,"""computers.note…","""lenovo""",251.74,550050854,"""7c90fc70-0e80-…"
"""2019-10-01 00:…","""view""",1004237,2053013555631882655,"""electronics.sm…","""apple""",1081.98,535871217,"""c6bd7419-2748-…"


In [4]:
# Polars guide on parsing dates and times:
# https://pola-rs.github.io/polars-book/user-guide/howcani/timeseries/parsing_dates_times.html

# Build the parsing expression first, then swap the string `event_time`
# column for its Datetime version (the column keeps its name).
event_time_parsed = pl.col("event_time").str.strptime(pl.Datetime, "%Y-%m-%d %H:%M:%S %Z")
df = df.with_columns(event_time_parsed)

In [5]:
# Confirm that `event_time` is now a datetime column rather than a string.
df.head(3) 

event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session
datetime[μs],str,i64,i64,str,str,f64,i64,str
2019-10-01 00:00:00,"""view""",44600062,2103807459595387724,,"""shiseido""",35.79,541312140,"""72d76fde-8bb3-…"
2019-10-01 00:00:00,"""view""",3900821,2053013552326770905,"""appliances.env…","""aqua""",33.2,554748717,"""9333dfbd-b87a-…"
2019-10-01 00:00:01,"""view""",17200506,2053013559792632471,"""furniture.livi…",,543.1,519107250,"""566511c2-e2e3-…"


In [6]:
#Useful step to understand the data. Look at one user-id
# (541312140 is the user in the first row of the preview above.)
# A fixed random_state keeps the sampled rows identical across re-runs, so the
# displayed output stays reproducible after Restart & Run All.
df.filter(pl.col('user_id') == 541312140).to_pandas().sample(5, random_state=42)

Unnamed: 0,event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session
66,2019-10-05 15:09:27,view,17600057,2053013558895051365,,payot,43.98,541312140,23fb14a1-9fd3-4e35-a729-bfaa64f4e875
105,2019-10-13 14:58:30,view,48200093,2146660887002349890,apparel.dress,mltl,38.35,541312140,e11fc6dc-1808-4f8e-8923-5b2ce3238b3c
62,2019-10-05 15:08:20,view,19600071,2053013558836331105,,payot,29.76,541312140,23fb14a1-9fd3-4e35-a729-bfaa64f4e875
85,2019-10-06 02:34:33,view,5600342,2053013562913194819,,braun,64.33,541312140,5bd3612b-3222-42d5-9dc8-ba54aeccbfaa
89,2019-10-06 02:39:12,view,5300032,2053013563173241677,,rowenta,66.35,541312140,5bd3612b-3222-42d5-9dc8-ba54aeccbfaa


## 1.Keep rows with purchases only

In [7]:
# Keep only the rows whose event_type is 'purchase'; every other event type is dropped.
is_purchase = pl.col('event_type') == 'purchase'
df_purchases = df.filter(is_purchase)

In [8]:
# (rows, columns) left after keeping purchase events only.
df_purchases.shape

(742849, 9)

## 2.Let's compute RFM

In [9]:
# Narrow the frame to the three columns RFM needs, then drop exact duplicate rows.
# NOTE(review): de-duplicating on (event_time, user_id, price) also merges rows where
# one user bought two equally priced items in the same second - confirm that is intended.
rfm_columns = ['event_time', 'user_id', 'price']
df_purchases = df_purchases.select(rfm_columns).unique()

In [10]:
# (rows, columns) after column selection and de-duplication.
df_purchases.shape

(742773, 3)

In [11]:
# Preview the de-duplicated purchase rows.
df_purchases.head()

event_time,user_id,price
datetime[μs],i64,f64
2019-10-01 02:19:12,516178643,391.26
2019-10-01 02:20:11,517129864,189.91
2019-10-01 02:20:28,554101070,91.12
2019-10-01 02:21:07,516815266,62.52
2019-10-01 02:21:59,514127317,3.37


#Compute time difference

In [12]:
# Reference point that every purchase's recency is measured back from.
# NOTE(review): the file name and sample rows suggest the data covers October 2019,
# so anchoring at 2019-11-30 makes even the latest purchase about a month old.
# Confirm this date is intended (a common choice is the day after the last event).
anchor_date = datetime(2019, 11, 30)

In [13]:
# Number of microseconds in one day; `event_time` is a datetime[μs] column, so the
# difference below is a duration stored in microseconds.
MICROSECONDS_PER_DAY = 1e6 * 3600 * 24

# date_diff = fractional days between each purchase and anchor_date.
# The duration is cast to Int64 explicitly so the division is plain integer/float
# arithmetic rather than relying on how the installed polars divides a Duration,
# and the alias is applied to the finished expression instead of its left operand.
df_purchases = df_purchases.with_columns(
    (
        (anchor_date - pl.col("event_time")).cast(pl.Int64) / MICROSECONDS_PER_DAY
    ).alias("date_diff")
)

In [14]:
# Check the new `date_diff` column (days between each purchase and anchor_date).
df_purchases.head()

event_time,user_id,price,date_diff
datetime[μs],i64,f64,f64
2019-10-01 02:19:12,516178643,391.26,59.903333
2019-10-01 02:20:11,517129864,189.91,59.90265
2019-10-01 02:20:28,554101070,91.12,59.902454
2019-10-01 02:21:07,516815266,62.52,59.902002
2019-10-01 02:21:59,514127317,3.37,59.9014


In [18]:
%%timeit
df_agg = df_purchases.group_by('user_id').agg([
    pl.col('date_diff').min().alias('recency'),
    pl.count().alias("frequency"),
    pl.col('price').sum().alias('monetary')
])

60.3 ms ± 4.14 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [19]:
# (rows, columns) of the per-user RFM table; one row per user_id.
df_agg.shape

(347118, 4)

In [20]:
# Preview the per-user recency / frequency / monetary values.
df_agg.head()

user_id,recency,frequency,monetary
i64,f64,u32,f64
516074584,39.437593,2,192.7
548225520,29.447303,5,1231.63
531781576,40.792917,1,154.18
559572080,30.564294,2,71.67
518537040,42.257546,2,499.72


## 3.Convert to pandas for analysis and plotting

In [21]:
# Hand the aggregated table to pandas for the quantile analysis and plotting below.
# NOTE(review): this rebinds `df_agg` from a polars frame to a pandas frame, so the
# cell cannot be re-run on its own (a pandas DataFrame has no `.to_pandas()`).
df_agg = df_agg.to_pandas()

In [23]:
# The three RFM measures to summarise.
num_vars = ['recency','frequency','monetary']
# Quantile levels in steps of 0.1, starting at 0.
breaks = np.arange(0, 1.1, 0.1)
# One row per quantile level, one column per RFM measure.
df_agg.loc[:, num_vars].quantile(q=breaks)

Unnamed: 0,recency,frequency,monetary
0.0,29.000509,1.0,0.88
0.1,31.583488,1.0,42.99
0.2,34.398789,1.0,82.574
0.3,37.014054,1.0,131.02
0.4,39.733875,1.0,174.31
0.5,42.679005,1.0,246.52
0.6,45.592382,1.0,331.972
0.7,48.523569,2.0,483.9
0.8,51.798266,2.0,766.76
0.9,55.703633,4.0,1418.05
