# Product Recommendation Data Construct

In [1]:
# Basic 
import sys
import json

# Warning problems in notebook
import warnings
warnings.filterwarnings('ignore')

# Progress bars
from tqdm import tqdm

# Reporting result
from IPython.display import display

import pickle

# Math
import numpy as np
import scipy as sp
import pandas as pd

# SQL Engine
import psycopg2
import pandas as pd
from sqlalchemy import create_engine

# Visualization
import plotly.express as px
from plotly.subplots import make_subplots
import matplotlib.pyplot as plt
import seaborn as sns

# Clustering
from sklearn.cluster import KMeans

# Dimensionality reduction
from sklearn.decomposition import PCA

In [2]:
# Progress apply for pandas
tqdm.pandas()

# Seaborn color palette
sns.set_palette('husl')

# Inline matplotlib
%matplotlib inline

plt.style.use('default')

# === Color palette ===
# RGB triplets on the 0-255 scale, one colour per row.
raw_light_palette = [
    (0, 122, 255),    # Blue
    (255, 149, 0),    # Orange
    (52, 199, 89),    # Green
    (255, 59, 48),    # Red
    (175, 82, 222),   # Purple
    (255, 45, 85),    # Pink
    (88, 86, 214),    # Indigo
    (90, 200, 250),   # Teal
    (255, 204, 0),    # Yellow
    (255, 255, 255),  # White
    (0, 0, 0),        # Black
]

# Same colours rescaled to the 0-1 float range, as an (n_colours, 3) array.
light_palette = np.divide(np.array(raw_light_palette), 255)

# Plotting pretty figures and avoid blurry images:
# ask the inline backend to emit figures in the 'retina' format.
%config InlineBackend.figure_format = 'retina'

# Apply seaborn's 'notebook' plotting context to the matplotlib settings
sns.set_context('notebook')

from matplotlib.offsetbox import AnchoredText
from mpl_toolkits.axes_grid1 import make_axes_locatable
# Visualization
import matplotlib as mpl
import plotly.express as px
from plotly.subplots import make_subplots
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.gridspec import GridSpec

# Accent colours (hex strings) shared by the styled tables below
color1, color2 = "#F36E8E", "#1ABDE9"

In [3]:
import json

def load_config(file_path: str = "./config.json") -> dict:
    """Read a JSON configuration file and return its parsed content.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The decoded JSON document. Callers in this notebook use it as a
        dict (they call ``.get`` on it).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file content is not valid JSON.
    """
    # Pin the encoding: without it open() falls back to the platform's locale
    # default, so non-ASCII values could be decoded differently per machine.
    with open(file_path, encoding="utf-8") as config_file:
        return json.load(config_file)

from urllib.parse import quote

# Read the connection settings from the project-level config file
config = load_config("../config.json")
DBNAME = config.get("DBNAME")
HOSTNAME = config.get("HOSTNAME")
USER = config.get("USER")
PASS = config.get("PASS")
SCHEMA = config.get("SCHEMA")

# Fail early with a readable message: a missing key would otherwise be rendered
# as the literal text "None" inside the connection URL and only surface later as
# a confusing connection error.
missing_keys = [key for key in ("DBNAME", "HOSTNAME", "USER") if config.get(key) is None]
if missing_keys:
    raise KeyError(f"../config.json is missing required key(s): {', '.join(missing_keys)}")

# Percent-encode the credentials so characters such as '@', '/', ':' or '#'
# in a user name or password cannot corrupt the URL structure.
# str() keeps non-string JSON values (e.g. a numeric password) working.
safe_user = quote(str(USER), safe="")
safe_pass = quote(str(PASS), safe="")

# Create an engine instance
alchemyEngine = create_engine(
    f'postgresql+psycopg2://{safe_user}:{safe_pass}@{HOSTNAME}/{DBNAME}', pool_recycle=3600)

# Connect to PostgreSQL server
# NOTE(review): this connection is never closed in the visible cells — consider
# conn.close() / alchemyEngine.dispose() once the last query has run.
conn = alchemyEngine.connect()

schema = SCHEMA

In [4]:
# === Init query ===
# QUERY returns one row per `u.user_name`, built from `staging.fct_order_items`
# restricted to order items whose status is 'delivered' or 'cancelled':
#   - recency / usage_days: 'day' part of the interval between the fixed reference
#     timestamp '2018-09-03' and the user's latest / earliest `dd."date"`
#   - frequency / monetary: MAX of `lifetime_order` / `lifetime_spending`
#   - latest_aov: MAX(lifetime_spending) divided by MAX(lifetime_order)
#   - lifetime_product_order: number of order-item rows in the group
#   - latest_avg_basket_size: order-item rows divided by MAX(lifetime_order)
# User names come from rows of `staging.dim_user` flagged `is_current_version=true`;
# the HAVING clause drops groups whose basket-size ratio is below 1.
# NOTE(review): `customer_state` is selected in the sub-query but not used by the outer select.
# NOTE(review): the reference date and the `staging` schema name are hard-coded in the SQL text.
QUERY = """
select
    u.user_name,
    DATE_PART('day', '2018-09-03'::timestamp- MAX(dd."date")) as recency,
	MAX(foi.lifetime_order) as frequency ,
	MAX(foi.lifetime_spending) as monetary,
    DATE_PART('day', '2018-09-03'::timestamp- MIN(dd."date")) as usage_days,
    MAX(foi.lifetime_spending) / MAX(foi.lifetime_order) as latest_aov,
    COUNT(foi.order_item_id)  as lifetime_product_order,
    COUNT(foi.order_item_id) / MAX(foi.lifetime_order)  as latest_avg_basket_size
from staging.fct_order_items foi
left join staging.dim_date dd on foi.order_date = dd.date_id 
left join (
	select 
		du.user_key ,
		du.user_name ,
    dg.state as customer_state
	from staging.dim_user du 
  left join staging.dim_geo dg on du.customer_geo_id = dg.geo_id 
	where du.is_current_version=true
) u on foi.user_key = u.user_key
where foi.order_item_status in ('delivered', 'cancelled')
group by u.user_name 
having COUNT(foi.order_item_id) / MAX(foi.lifetime_order) >= 1
"""

# PRODUCT_QUERY returns one row per 'delivered' or 'cancelled' order item with its
# order id, the user name, and the product id / category.
# NOTE(review): unlike QUERY, the join to `staging.dim_user` here has no
# `is_current_version` filter — confirm that this difference is intended.
PRODUCT_QUERY = """
select 
	foi.order_id ,
	du.user_name ,
    dp.product_id,
	dp.product_category
from staging.fct_order_items foi 
left join staging.dim_product dp on dp.product_key = foi.product_key 
left join staging.dim_user du on du.user_key = foi.user_key 
where foi.order_item_status in ('delivered', 'cancelled')
"""

# Run both queries over the open connection and load the results into DataFrames
df = pd.read_sql_query(QUERY, conn)
product_df = pd.read_sql_query(PRODUCT_QUERY, conn)

# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() appends a stray "None" line to the output.
print("=== RFM Data Construct ===")
df.info()

print("=== Product x User Example Data Construct ===")
product_df.info()

=== RFM Data Construct ===
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 93279 entries, 0 to 93278
Data columns (total 8 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   user_name               93279 non-null  object 
 1   recency                 93279 non-null  float64
 2   frequency               93279 non-null  float64
 3   monetary                93279 non-null  float64
 4   usage_days              93279 non-null  float64
 5   latest_aov              93279 non-null  float64
 6   lifetime_product_order  93279 non-null  int64  
 7   latest_avg_basket_size  93279 non-null  float64
dtypes: float64(6), int64(1), object(1)
memory usage: 5.7+ MB
None
=== Product x User Example Data Construct ===
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 110197 entries, 0 to 110196
Data columns (total 4 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   order_id     

## Data Exploration

In [5]:
# Numeric feature columns to summarise
cols = [
    'recency',
    'frequency',
    'monetary',
    'usage_days',
    'latest_aov',
    'lifetime_product_order',
    'latest_avg_basket_size',
]

# Summary statistics with one row per feature, decorated for quick scanning:
# bars on mean / min / max, colour gradients on std and the median.
(
    df[cols]
    .describe()
    .T
    .style
    .bar(subset=['mean'], color=color2)
    .bar(subset=['min', 'max'], color=color1)
    .background_gradient(subset=['std'], cmap='Reds')
    .background_gradient(subset=['50%'], cmap='coolwarm')
)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
recency,93279.0,242.448214,152.579458,5.0,119.0,223.0,351.0,718.0
frequency,93279.0,1.033341,0.204274,1.0,1.0,1.0,1.0,9.0
monetary,93279.0,165198.722113,226332.280463,9590.0,63010.0,107790.0,182590.0,13664080.0
usage_days,93279.0,245.085378,153.088773,5.0,121.0,226.0,355.0,718.0
latest_aov,93279.0,160305.651481,219574.627248,9590.0,62345.0,105660.0,176650.0,13664080.0
lifetime_product_order,93279.0,1.180298,0.619392,1.0,1.0,1.0,1.0,24.0
latest_avg_basket_size,93279.0,1.139499,0.526977,1.0,1.0,1.0,1.0,21.0


In [6]:
# === RFM & Usage Days & AoV ===

In [7]:
# === Basket size ===

In [8]:
# === Save Data ===
from pathlib import Path

rfm_path = Path("../data/processed/rfm-unsupervised-dataset.pkl")
product_path = Path("../data/processed/product-unsupervised-dataset.pkl")

# Create the output folder if needed so the save cannot fail on a fresh checkout
# after the expensive queries have already run.
for out_path in (rfm_path, product_path):
    out_path.parent.mkdir(parents=True, exist_ok=True)

# Pickle protocol 4 is pinned so the files stay loadable by older interpreters
# (protocol 5 only exists from Python 3.8 onwards).
df.to_pickle(rfm_path, protocol=4)
product_df.to_pickle(product_path, protocol=4)