# Modelling Ecommerce User Journey Using Markov Chain


This notebook is intended purely as a code illustration.

## Import Library

In [1]:
import numpy as np
import pandas as pd

np.set_printoptions(suppress=True)
%load_ext autoreload
%autoreload 2

In [2]:
DEBUG = False
SEED = 1234
RANDOM_LEVEL = 0.5

## User-Defined Functions

In [3]:
def classify(str):
    """Classify a page URL into one of four page types.

    The URL is checked for path markers in a fixed order and the first match
    wins: "/harga/" -> "PDP", "/search/" -> "Search Page",
    "/checkout/" -> "Checkout Page". Anything else is labelled "PLP".
    """
    # Order matters: a URL containing several markers gets the earliest label.
    markers = (
        ("/harga/", "PDP"),
        ("/search/", "Search Page"),
        ("/checkout/", "Checkout Page"),
    )
    for fragment, label in markers:
        if fragment in str:
            return label
    return "PLP"


def clean(data, str):
    """Aggregate a landing or exit dataframe to one row per page type.

    `str` is the kind of frame ("landing" or "exit" in this notebook); it is
    title-cased to build the name of the URL column, e.g. "Landing Page".
    Returns a frame whose columns are, in order, `page_type` and the
    "Pageviews" total for that type.
    """
    # DEBUG uses a small reproducible sample; otherwise the counts are perturbed.
    frame = data.sample(10, random_state=SEED) if DEBUG else randomize(data)

    url_column = "{} Page".format(str.title())
    typed = frame.assign(page_type=frame[url_column].map(classify))

    # Positional pick: column 2 is the freshly appended page_type and column 1
    # is expected to be "Pageviews" (it is referenced by name just below).
    typed = typed.iloc[:, [2, 1]]

    totals = typed.groupby(["page_type"])["Pageviews"].transform("sum")
    typed["Pageviews"] = totals

    # Every row of a page type now carries the same total; keep the first.
    return typed.drop_duplicates(subset=["page_type"])


def randomize(df, level=None, seed=None):
    """Return a copy of `df` with part of its "Pageviews" column perturbed.

    The page, landing and exit frames are randomized because the underlying
    iPrice data is confidential and cannot be published as-is.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a numeric "Pageviews" column. It is left untouched.
    level : float, optional
        Used twice: as the fraction of rows that get perturbed, and as the
        maximum relative change (each picked value is multiplied by a factor
        drawn uniformly between ``1 - level`` and ``1 + level``).
        Defaults to the notebook-wide ``RANDOM_LEVEL``.
    seed : int, optional
        Seed for both the row sampling and the NumPy draw.
        Defaults to the notebook-wide ``SEED``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same rows and columns as `df`, where the picked
        rows hold the rounded, non-negative perturbed page-view counts.
    """
    if level is None:
        level = RANDOM_LEVEL
    if seed is None:
        seed = SEED

    # Work on a copy: writing into the caller's frame would make re-running a
    # cell perturb already-perturbed data and silently change the raw input.
    result = df.copy()

    picked = result.sample(frac=level, random_state=seed)
    # NOTE: this reseeds NumPy's global random state, as the notebook always did.
    np.random.seed(seed)
    percent = np.random.uniform(-level, level, len(picked))
    scaled = picked["Pageviews"] * (1 + percent)
    # Round to whole views; abs() guards against a negative factor when level > 1.
    replacement = np.absolute(np.around(scaled)).astype(int)

    result.loc[replacement.index, "Pageviews"] = replacement
    return result

## Read Raw Data

In [4]:
# Load the three sheets of the raw workbook in a single pass; passing a list of
# sheet names makes read_excel return a dict keyed by sheet name.
raw_sheets = pd.read_excel(
    "data/raw.xlsx", sheet_name=["previousPage", "landing", "exit"]
)
page = raw_sheets["previousPage"]
landing = raw_sheets["landing"]
# NOTE: this name shadows the built-in `exit`; later cells refer to it.
exit = raw_sheets["exit"]

## Cleaning page Dataframe

In [5]:
# DEBUG uses a small reproducible sample; otherwise the counts are perturbed.
page = page.sample(10, random_state=SEED) if DEBUG else randomize(page)

# Drop "(entrance)" rows, label both ends of every move with its page type,
# then keep (by position) previous_page_type, page_type and column 2, which
# must be "Pageviews" for the aggregation below to work.
page = (
    page.loc[lambda df: df["Previous Page Path"] != "(entrance)"]
    .assign(
        page_type=lambda df: df["Page"].map(classify),
        previous_page_type=lambda df: df["Previous Page Path"].map(classify),
    )
    .iloc[:, [4, 3, 2]]
)

# Give every row the total page views of its (page_type, previous_page_type) pair ...
pair_totals = page.groupby(["page_type", "previous_page_type"])["Pageviews"].transform("sum")
page = page.assign(Pageviews=pair_totals)

# ... so that one row per pair is enough.
page_type = page.drop_duplicates(subset=["page_type", "previous_page_type"])

## Cleaning landing & exit Dataframe

In [6]:
# Landings are moves from the artificial "Start" state into a page type.
# Columns are reordered by position to previous_page_type, page_type, Pageviews.
landing_type = (
    clean(landing, "landing")
    .assign(previous_page_type="Start")
    .iloc[:, [2, 0, 1]]
)

# Exits are moves from a page type into the artificial "End" state. assign()
# evaluates its keywords in order, so the original page type is copied into
# previous_page_type before page_type is overwritten with "End".
exit_type = (
    clean(exit, "exit")
    .assign(previous_page_type=lambda df: df["page_type"], page_type="End")
    .iloc[:, [2, 0, 1]]
)

## Transition Matrix in Pandas

In [7]:
# Stack the page-to-page, "Start" -> page and page -> "End" counts, then pivot
# them into a matrix with rows = previous page type and columns = page type.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
transition_matrix = (
    pd.concat([page_type, landing_type, exit_type])
    .pivot(index="previous_page_type", columns="page_type", values="Pageviews")
    .fillna(0)
    .astype(int)
)

# Reindexing to the full state list puts rows and columns in a fixed order and
# adds the states the pivot cannot produce (the "End" row, the "Start" column)
# as zeros. Transitions that never occur stay 0 instead of becoming NaN.
order = ["Start", "Checkout Page", "PDP", "PLP", "Search Page", "End"]
transition_matrix = transition_matrix.reindex(
    index=order, columns=order, fill_value=0
)

# Make "End" absorbing with a self-loop. Its weight is the total of all
# observed counts; the rest of the "End" row is zero.
transition_matrix.loc["End", "End"] = transition_matrix.to_numpy().sum()

print(transition_matrix)

# Debug runs work on a tiny sample, so only a full run is written to disk.
if not DEBUG:
    transition_matrix.to_csv("result/transition_matrix.csv")

page_type           Start  Checkout Page      PDP       PLP  Search Page  \
previous_page_type                                                         
Start                   0          42996  5789152  12529655      3126393   
Checkout Page           0         108943   461754     89278        75979   
PDP                     0         564483  5405731   1773164       216972   
PLP                     0          74802  2132129   4357494         3777   
Search Page             0          81088   284087     11866      1350673   
End                     0              0        0         0            0   

page_type                End  
previous_page_type            
Start                      0  
Checkout Page         623973  
PDP                  8521594  
PLP                  9175992  
Search Page          2581942  
End                 59383917  


## Transition Matrix, *P*

In [8]:
# The counts come from the in-memory frame. To start from the saved file
# instead, read "result/transition_matrix.csv" and drop its first (label) column.
np_matrix = transition_matrix.to_numpy()

# Divide every row by its total so that each row becomes a probability distribution.
row_sum = np_matrix.sum(axis=1)
P = np_matrix / row_sum.reshape(-1, 1)

print("P = \n{}".format(P))

P = 
[[0.         0.00200091 0.2694108  0.58309478 0.14549351 0.        ]
 [0.         0.08010945 0.33954323 0.06564911 0.05586991 0.4588283 ]
 [0.         0.03424857 0.32797897 0.10758221 0.01316422 0.51702603]
 [0.         0.00475108 0.13542319 0.27676831 0.0002399  0.58281751]
 [0.         0.01881542 0.06591872 0.00275335 0.31340622 0.59910629]
 [0.         0.         0.         0.         0.         1.        ]]


## Canonical Form

In [9]:
# The first five states (Start and the four page types) are transient;
# the last state, "End" (index 5), is absorbing.
n_transient = 5

Q = P[:n_transient, :n_transient]  # transient -> transient probabilities
R = P[:n_transient, n_transient]   # transient -> "End" probabilities

print("Q = \n{} \nR = {}".format(Q, R))

Q = 
[[0.         0.00200091 0.2694108  0.58309478 0.14549351]
 [0.         0.08010945 0.33954323 0.06564911 0.05586991]
 [0.         0.03424857 0.32797897 0.10758221 0.01316422]
 [0.         0.00475108 0.13542319 0.27676831 0.0002399 ]
 [0.         0.01881542 0.06591872 0.00275335 0.31340622]] 
R = [0.         0.4588283  0.51702603 0.58281751 0.59910629]


## Fundamental Matrix, $N = (I - Q)^{-1}$

In [10]:
# Identity matrix with the same size as Q, then invert (I - Q).
I = np.eye(Q.shape[0])
N = np.linalg.inv(I - Q)

print("N = \n{}".format(N))

N = 
[[1.         0.0346616  0.62261506 0.90286081 0.22697975]
 [0.         1.11292469 0.61110659 0.19231529 0.10234564]
 [0.         0.06041443 1.57021784 0.23919074 0.03510581]
 [0.         0.01863558 0.29808954 1.42874523 0.00773097]
 [0.         0.03637362 0.16869614 0.03396401 1.46267141]]


## Expected Number of Steps Before Absorption, $t = N\mathbf{1}$

In [11]:
# Multiplying N by a vector of ones is the same as summing each row of N.
t = N.sum(axis=1)

print("t = {}".format(t))

t = [2.78711722 2.01869222 1.90492881 1.75320132 1.70170518]
