# Customer Recency Analysis

## Purpose
Finding insight on :

## Background

## Assumption

## Sanity Check

In [None]:
import json

def load_config(file_path: str = "./config.json") -> dict:
    """Load the notebook configuration from a JSON file.

    Args:
        file_path: Path to the JSON configuration file.

    Returns:
        The parsed JSON document. The notebook reads its keys with ``.get``,
        so the top-level value is expected to be a JSON object.

    Raises:
        OSError: If the file cannot be opened (e.g. ``FileNotFoundError``).
        json.JSONDecodeError: If the file content is not valid JSON.
    """
    # JSON text is UTF-8; pin the encoding so non-ASCII values (for example
    # in a password) do not depend on the platform's default locale.
    with open(file_path, encoding="utf-8") as config_file:
        return json.load(config_file)

# Read the settings file that lives one directory above the notebook and
# expose the database connection parameters as module-level constants.
# A key that is absent from the file yields None.
config = load_config("../config.json")
DBNAME, HOSTNAME, USER, PASS, SCHEMA = (
    config.get(key) for key in ("DBNAME", "HOSTNAME", "USER", "PASS", "SCHEMA")
)

In [None]:
# Basic 
import sys
import numpy as np
import scipy as sp
import pandas as pd

# SQL Engine
import psycopg2
import pandas as pd
from sqlalchemy import create_engine

# Profiling process
from tqdm import tqdm

# Warning problems in notebook
import warnings
warnings.filterwarnings('ignore')

# Visualization
import bamboolib as bam
import plotly.express as px

# Reporting result
import sweetviz as sv
from dataprep.eda import create_report

In [None]:
def mapping_geolocation(val):
    """Return the alias registered for a state name, or the name itself.

    The result is later joined against the upper-cased ``NAME_1`` column of
    the GADM file, so the aliases presumably match the spelling used there
    (TODO confirm against the map data).

    Args:
        val: State name as stored in the user/seller dimension tables.

    Returns:
        The mapped name when ``val`` has an alias, otherwise ``val`` unchanged.
    """
    state_aliases = {
        "DKI JAKARTA": "JAKARTA RAYA",
        # NOTE(review): this maps one province onto a differently named one
        # rather than a spelling variant — confirm it is intentional.
        "KALIMANTAN UTARA": "KALIMANTAN TENGAH",
        "DI YOGYAKARTA": "YOGYAKARTA",
        "KEPULAUAN BANGKA BELITUNG": "BANGKA BELITUNG",
    }
    # Unknown names fall through untouched.
    return state_aliases.get(val, val)

In [None]:
# Load data

# Create an engine instance
# The connection URL is assembled from the values read from config.json.
# NOTE(review): USER/PASS are interpolated verbatim, so a password containing
# URL-reserved characters such as '@' or '/' would presumably break URL
# parsing — confirm the credentials are URL-safe.
# pool_recycle=3600 asks SQLAlchemy to replace pooled connections older than
# one hour.
alchemyEngine = create_engine(
    f'postgresql+psycopg2://{USER}:{PASS}@{HOSTNAME}/{DBNAME}', pool_recycle=3600)

# Connect to PostgreSQL server
# This single connection is reused by every pd.read_sql_query call below; it
# is not closed anywhere in the visible part of the notebook.
conn = alchemyEngine.connect()

# NOTE(review): `schema` is not referenced by the visible queries, which
# hard-code the `staging.` prefix — confirm whether it is still needed.
schema = SCHEMA

## Multiple vs Single Time Only User

### Understand Lifetime Purchase Behaviour

In [None]:
QUERY = """
select 
	u.user_name,
	MAX(foi.lifetime_order) as lifetime_order ,
	MAX(foi.lifetime_spending) as lifetime_spending 
from staging.fct_order_items foi
left outer join (
	select 
		du.user_key ,
		du.user_name 
	from staging.dim_user du 
	where du.is_current_version=true
) u on foi.user_key = u.user_key
group by u.user_name
"""

df = pd.read_sql_query(QUERY, conn)
df

In [None]:
df_temp = df.groupby(['lifetime_order']).agg(number_of_customer=('user_name', 'size')).reset_index()
df_temp

In [None]:
fig = px.bar(df_temp, x='lifetime_order', y='number_of_customer', template='ggplot2', title='User by Number of Order')
fig.update_yaxes(title_text='Number of user')
fig.update_xaxes(title_text='Number of order')
fig

In [None]:
df_temp = df.groupby(['lifetime_spending']).agg(number_of_customer=('user_name', 'size')).reset_index()
df_temp

In [None]:
fig = px.histogram(df_temp, x='lifetime_spending', y='number_of_customer', template='ggplot2', title='User by Total Spending')
fig.update_yaxes(title_text='Number of user')
fig.update_xaxes(title_text='Total Spending')
fig

### Important Notes : 
Most customers make only a single purchase on the e-commerce platform.

## Understanding the Interval Between Purchases for Repeat Customers

In [None]:
QUERY = """
select 
distinct
	foi.order_id,
	u.user_name,
	dd.date
from staging.fct_order_items foi 
left outer join (
	select 
		du.user_key ,
		du.user_name 
	from staging.dim_user du 
	where du.is_current_version=true
) u on foi.user_key = u.user_key
left outer join staging.dim_date dd on foi.order_date = dd.date_id 
left outer join staging.dim_time dt on foi.order_time = dt.time_id
where foi.lifetime_order > 1;
"""

interval_df = pd.read_sql_query(QUERY, conn)
interval_df['date'] = pd.to_datetime(interval_df['date'], format='%Y-%m-%d')
interval_df

In [None]:
# The query returns rows in no guaranteed order, so put each user's orders in
# chronological order first; otherwise shift(1) pairs an order with an
# arbitrary row instead of the purchase that actually preceded it.
interval_df = interval_df.sort_values(['user_name', 'date'])

# Date of the same user's previous order (NaT for the user's first order).
interval_df['previous_date'] = interval_df.groupby('user_name')['date'].shift(1)

# Drop rows that have no previous order (or any other missing value).
interval_df = interval_df.dropna()

# Whole days elapsed since the previous order.
interval_df['order_interval'] = (interval_df['date'] - interval_df['previous_date']).dt.days

# Keep only real gaps: orders placed on the same day give 0 and are excluded.
interval_df = interval_df[interval_df['order_interval'] > 0]
interval_df

In [None]:
fig = px.violin(interval_df[interval_df['order_interval'] > 0], box=True, template="ggplot2", x='order_interval', title='Distribution between each order (multiple purchase user)')
fig.update_xaxes(title_text='Number of Days')
fig

In [None]:
fig = px.box(interval_df[interval_df['order_interval'] > 0], template="ggplot2", x='order_interval', title='Distribution between each order (multiple purchase user)')
fig.update_xaxes(title_text='Number of Days')
fig

The violin plot above shows that the majority of customers who place a repeat order do so between 22 and 168 days after their previous one. The vast majority of customers repurchase within 400 days, with a few extreme cases exceeding 600 days. The upper quartile (Q3) suggests an interval of roughly 183 days, which is about 6 months. We use this upper bound as the cut-off point for recency: any purchase within the last 6 months (~183 days) is considered a recent purchase.

## User Recency Segmentation

In [None]:
# Finding max date recorded in data
QUERY = """
select MAX(dd."date")
from staging.fct_order_items foi
left outer join staging.dim_date dd on foi.order_date = dd.date_id 
limit 1;
"""

# Bind the result to a descriptive name: assigning to `_` in IPython/Jupyter
# stops the shell from updating its automatic "last output" variable.
max_date_df = pd.read_sql_query(QUERY, conn)

# The un-aliased MAX(...) column is read back under the name "max"; the
# single-row result holds the latest order date in the data.
max_date = max_date_df['max'][0]
print(f"Max Order Date Recorded : {max_date.day}-{max_date.month}-{max_date.year}")

In [None]:
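# Init RFM Analysis Query
# NOTE(review): the reference date '2018-09-03' is hard-coded three times in
# the query below; it presumably mirrors the max order date found in the
# previous cell — confirm, and keep the literals in sync when data is refreshed.
# NOTE(review): `monetary` and `total_spending` are the same expression, and
# `order by 6` sorts on the sixth selected column (total_spending).
# NOTE(review): in PostgreSQL `/` between two integer-typed operands truncates;
# avg_basket_size (COUNT / lifetime_order) is affected if lifetime_order is an
# integer column — verify the column type.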
QUERY = """
select 
	u.user_name,
	DATE_PART('day', '2018-09-03'::timestamp- MAX(dd."date")) as recency,
	MAX(foi.lifetime_order) as frequency ,
	MAX(foi.lifetime_spending) as monetary,
	MAX(dd."date") - MIN(dd."date") as usage_days,
    MAX(foi.lifetime_spending) as total_spending,
	MAX(foi.lifetime_spending) / MAX(foi.lifetime_order) as average_order_value,
    COUNT(foi.order_item_id)  as total_basket_size,
    COUNT(foi.order_item_id) / MAX(foi.lifetime_order)  as avg_basket_size,
    CASE 
        WHEN DATE_PART('day', '2018-09-03'::timestamp- MAX(dd."date")) < 183 THEN 'ACTIVE'
        WHEN DATE_PART('day', '2018-09-03'::timestamp- MAX(dd."date")) >= 183 AND DATE_PART('day', '2018-09-03'::timestamp- MAX(dd."date")) < 365 THEN 'INACTIVE'
        ELSE 'LAPSED'
    END as recency_status
from staging.fct_order_items foi
left join staging.dim_date dd on foi.order_date = dd.date_id 
left join (
	select 
		du.user_key ,
		du.user_name 
	from staging.dim_user du 
	where du.is_current_version=true
) u on foi.user_key = u.user_key
group by u.user_name 
order by 6 desc;
"""
rfm_df = pd.read_sql_query(QUERY, conn)
rfm_df = rfm_df.dropna(subset=['frequency'])
rfm_df

In [None]:
# Count users per recency status once and reuse it for the x, y and color
# arguments instead of recomputing value_counts() three times.
status_counts = rfm_df['recency_status'].value_counts()

# The title now describes what is plotted (users per recency status); the
# previous text was copied from the "number of order" chart.
fig = px.bar(x=status_counts.index,
             y=status_counts.values,
             template='ggplot2',
             title='User by Recency Status',
             color=status_counts.index)
fig.update_yaxes(title_text='Number of user')
fig.update_xaxes(title_text='Recency Status')
fig.update_layout(legend_title_text='Type')
fig

In [None]:
fig = px.box(rfm_df, x='recency_status', y='recency',
             template='ggplot2', 
             title='Distribution of recency days of user',
            color='recency_status')
fig.update_yaxes(title_text='Number of days')
fig.update_xaxes(title_text='Recency Status')
fig.update_layout(legend_title_text='Type')
fig

## User Spending Preference based on Recency Segment Type

In [None]:
# Checking for any missed data (some order may not delivered yet -> cancelled, unavailable, etc)
rfm_df.loc[rfm_df.avg_basket_size < 1]

In [None]:
# We can see the distribution for the basket size
# rfm_df holds one row per user (the RFM query groups by user_name), so each
# bar counts users, not orders — the y-axis is labelled accordingly.
fig = px.histogram(rfm_df, x='avg_basket_size', template='ggplot2', title='User Average Cart Size Distribution ')
fig.update_yaxes(title_text='Number of user')
fig.update_xaxes(title_text='Number of item')
fig

In [None]:
fig = px.violin(rfm_df, x='avg_basket_size', box=True, template='ggplot2', title='User Average Cart Size Distribution ')
fig.update_xaxes(title_text='Number of item')
fig

In [None]:
# Split users along two axes: basket volume and spending value.
# TODO : barplot + create segmentation using apply

# Decile rank (1 = lowest, 10 = highest) of each user's average order value
# and of their total spending.
decile_labels = np.arange(1, 11, 1)
rfm_df['percentile_avg_spending'] = pd.qcut(
    rfm_df['average_order_value'], 10, labels=decile_labels
).astype('int')
rfm_df['percentile_total_spending'] = pd.qcut(
    rfm_df['total_spending'], 10, labels=decile_labels
).astype('int')

# More than one item per order on average counts as high volume.
rfm_df['volume_type'] = np.where(
    rfm_df['avg_basket_size'] > 1, "High Volume", "Low Volume"
)

# Deciles 1-5 of average order value are low value, 6-10 high value.
rfm_df['spending_type'] = np.where(
    rfm_df['percentile_avg_spending'] < 6, "Low Value", "High Value"
)

In [None]:
def mapping_segment(row):
    """Derive the segment label of one RFM row from its recency status.

    Args:
        row: Object exposing ``recency_status`` and, depending on the status,
            ``spending_type`` and ``volume_type`` attributes (a DataFrame row
            when called through ``DataFrame.apply(axis=1)``).

    Returns:
        ``"<spending_type> & <volume_type>"`` for ACTIVE rows, the bare
        ``spending_type`` for INACTIVE rows, and ``"Invalid"`` for any other
        status.
    """
    status = row.recency_status
    if status == "INACTIVE":
        return row.spending_type
    if status != "ACTIVE":
        return "Invalid"
    return f"{row.spending_type} & {row.volume_type}"

rfm_df['segment'] = rfm_df.apply(mapping_segment, axis = 1)
rfm_df

In [None]:
rfm_df_active = rfm_df.loc[rfm_df.recency_status =="ACTIVE"]
rfm_df_inactive = rfm_df.loc[rfm_df.recency_status =="INACTIVE"]

In [None]:
rfm_df_active_segment = rfm_df_active.groupby(['segment']).agg(user_count=('user_name', 'size'), avg_spending = ('average_order_value', 'mean'), avg_basket = ('avg_basket_size', 'mean')).reset_index()
rfm_df_active_segment

In [None]:
fig = px.bar(rfm_df_active_segment, x='segment', y='user_count', color="segment", template='ggplot2', title='Active User Value & Volume Segment')
fig.update_xaxes(categoryorder='total descending')
fig.update_yaxes(title_text='Number of User')
fig.update_xaxes(title_text='Segment')
fig

In [None]:
fig = px.bar(rfm_df_active_segment, x='segment', y='avg_spending', color="segment", template='ggplot2', title='Active User Average Spending (per Segment)')
fig.update_xaxes(categoryorder='total descending')
fig.update_yaxes(title_text='Average Spending Value')
fig.update_xaxes(title_text='Segment')
fig

In [None]:
fig = px.bar(rfm_df_active_segment, x='segment', y='avg_basket', color="segment", template='ggplot2', title='Active User Average Cart Size (per Segment)')
fig.update_xaxes(categoryorder='total descending')
fig.update_yaxes(title_text='Average Cart Size')
fig.update_xaxes(title_text='Segment')
fig

In [None]:
rfm_df_inactive_segment = rfm_df_inactive.groupby(['segment']).agg(user_count=('user_name', 'size'),avg_spending = ('average_order_value', 'mean')).reset_index()
rfm_df_inactive_segment

In [None]:
fig = px.bar(rfm_df_inactive_segment, x='segment', y='user_count', color="segment", template='ggplot2', title='Inactive User Value')
fig.update_xaxes(categoryorder='total descending')
fig.update_yaxes(title_text='Number of User')
fig.update_xaxes(title_text='Segment')
fig

In [None]:
fig = px.bar(rfm_df_inactive_segment, x='segment', y='avg_spending', color="segment", template='ggplot2', title='Inactive User Average Spending (per Segment)')
fig.update_xaxes(categoryorder='total descending')
fig.update_yaxes(title_text='Average Spending Value')
fig.update_xaxes(title_text='Segment')
fig

## User vs Seller Location

In [None]:
# Init needed data

QUERY_USER = """
select * 
from staging.dim_user;
"""

QUERY_SELLER = """
select * 
from staging.dim_seller;
"""

# Init dataframe
# NOTE(review): the user query selects every dim_user row, while the other
# queries in this notebook keep only is_current_version=true; if dim_user
# stores several versions per user, the per-state counts below may count a
# user more than once — confirm.
user_df = pd.read_sql_query(QUERY_USER, conn)
seller_df = pd.read_sql_query(QUERY_SELLER, conn)

# Mapping geolocation
# Map the state column element-wise: Series.map avoids building a row object
# per record and also works when a query returns no rows.
user_df['mapped_geolocation'] = user_df['customer_state'].map(mapping_geolocation)
seller_df['mapped_geolocation'] = seller_df['seller_state'].map(mapping_geolocation)

In [None]:
# User df info
user_df_geo_grouped = user_df.groupby(['mapped_geolocation']).agg(user_count=('user_name', 'size')).reset_index()
user_df_geo_grouped

In [None]:
fig = px.bar(user_df_geo_grouped, x='mapped_geolocation', y='user_count', title='Number of user per state', template='plotly_white', orientation='v', color='mapped_geolocation')
fig.update_xaxes(categoryorder='total descending')
fig.update_xaxes(title_text='State')
fig.update_yaxes(title_text='Number of user')
fig.update_layout(legend_title_text='State')
fig

In [None]:
# Seller df info
seller_df_geo_grouped = seller_df.groupby(['mapped_geolocation']).agg(seller_count=('seller_id', 'size')).reset_index()
seller_df_geo_grouped

In [None]:
fig = px.bar(seller_df_geo_grouped, x='mapped_geolocation', y='seller_count', title='Number of seller per state', template='plotly_white', orientation='v', color='mapped_geolocation')
fig.update_xaxes(categoryorder='total descending')
fig.update_xaxes(title_text='State')
fig.update_yaxes(title_text='Number of seller')
fig.update_layout(legend_title_text='State')
fig

In [None]:
import geopandas as gpd

path = '../data/gadm36_IDN_1.json'
df_geo = gpd.read_file(path)
df_geo['NAME_1_upper'] = df_geo['NAME_1'].str.upper()

In [None]:
seller_geo = df_geo.merge(seller_df_geo_grouped, how="inner", left_on="NAME_1_upper", right_on="mapped_geolocation")
seller_geo.info()

In [None]:
fig = px.choropleth(seller_geo,
                   geojson=seller_geo.geometry,
                   locations=seller_geo.index,
                   hover_name=seller_geo.mapped_geolocation,
                   color="seller_count",
                    template='ggplot2'
                   )
fig.update_geos(fitbounds="locations", visible=False)

In [None]:
user_geo = df_geo.merge(user_df_geo_grouped, how="inner", left_on="NAME_1_upper", right_on="mapped_geolocation")
user_geo.info()

In [None]:
fig = px.choropleth(user_geo,
                   geojson=user_geo.geometry,
                   locations=user_geo.index,
                   hover_name=user_geo.mapped_geolocation,
                   color="user_count",
                    template='ggplot2'
                   )
fig.update_geos(fitbounds="locations", visible=False)

## Recommendation

## Reference
- [RFM Analysis](https://www.kaggle.com/alpamys/rfm-cohort-analysis)
- [Customer Satisfaction](https://www.kaggle.com/andresionek/predicting-customer-satisfaction)
- [Mapping in Plotly](https://towardsdatascience.com/noobs-guide-to-create-choropleth-map-using-python-geopandas-d6269e9e9a0c)
- [Kaggle Plotting Map](https://www.kaggle.com/farizdarari/simple-map-visualization-using-geopandas)
- [Web application for map visualization](https://plotly.com/python/choropleth-maps/)