In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Intro into RFM Framework

RFM framework is a method used to determine customer value by looking at the following three dimensions:
* Recency: when was the last time the user took an action (e.g., logged in, placed an order)?
* Frequency: how many times does the user take this action?
* Monetary value: what is the sum of monetary value from this user throughout his/her lifetime?

When used properly, RFM becomes a powerful tool to identify the most valuable customers (MVCs) of a business. Based on the output from this model, we are able to develop customized CRM strategies for different customer segments. With this post, I want to share the Kaggle notebook from an RFM segmentation analysis plus some tips that I found useful in the application of the model output.

In [None]:
# Load the dataset from the Kaggle input directory into `data`
# and have a peek at the first rows.
data = pd.read_csv("/kaggle/input/ecommerce-data/data.csv")
data.head()

In [None]:
# Check the basic info of the dataset: size, variables, data types.
# The non-null count per column also shows which columns have missing values.
data.info()

In [None]:
# "Description" and "CustomerID" contain missing values.
# The RFM segmentation is built per customer, so rows without a customer ID are
# unusable; every row with a missing value in any column is dropped here.
data = data.dropna(axis="index", how="any")
data.info()

In [None]:
# First step of converting "InvoiceDate" to datetime: keep only the text before
# the first space (the date part). The parsing itself happens in the next cell.
from datetime import datetime
data["InvoiceDate"] = data["InvoiceDate"].str.split(" ").str[0]
data["InvoiceDate"].head()

In [None]:
# Parse the month/day/year strings into datetimes in one vectorised call
# instead of calling datetime.strptime once per row.
data["InvoiceDate"] = pd.to_datetime(data["InvoiceDate"], format="%m/%d/%Y")

In [None]:
# Check the dataset info after conversion ("InvoiceDate" should now be a datetime column)
data.info()

In [None]:
# Add "OrderValue", the value of each invoice line: Quantity * UnitPrice.
data["OrderValue"] = data["Quantity"].mul(data["UnitPrice"])
data.loc[:, ["Quantity", "UnitPrice", "OrderValue"]].head()

In [None]:
# Aggregate at customer level: earliest and latest invoice date, number of
# distinct invoice dates, and the total order value.
# The aggregations are passed by name ("min", "max", ...) rather than as
# Python/pandas callables, which is the form pandas recommends for built-in
# reductions; the resulting column labels are the same names.
customer_data = pd.DataFrame(
    data.pivot_table(
        index="CustomerID",
        values=["InvoiceDate", "OrderValue"],
        aggfunc={
            "InvoiceDate": ["min", "max", "nunique"],
            "OrderValue": "sum",
        },
    )
)
customer_data.head()

In [None]:
# Inspect the structure (columns, dtypes, non-null counts) of the customer-level table.
customer_data.info()

In [None]:
# Replace the two-level aggregated column labels with flat, descriptive names.
# NOTE(review): this is positional and relies on the aggregated columns arriving
# in the order max date, min date, nunique, sum — confirm against the table above.
customer_data = customer_data.set_axis(
    ["LastInvoiceDate", "FirstInvoiceDate", "Frequency", "MonetaryValue"],
    axis=1,
)
customer_data.head()

In [None]:
import matplotlib.pyplot as plt
# Create column "FirstInvoiceMonth" (the first invoice date moved to the 1st of
# its month) and plot the number of customers acquired in each month.
customer_data["FirstInvoiceMonth"] = customer_data["FirstInvoiceDate"].map(
    lambda first_date: first_date.replace(day=1)
)
customer_data.groupby("FirstInvoiceMonth")["FirstInvoiceDate"].count().plot(kind="bar")
plt.title("Monthly Customer Intakes")

In [None]:
# Calculate recency, relative recency and relative frequency
# Take the maximum invoice date as today: the dataset is historical, so the
# latest last-invoice date across all customers serves as the reference date.
today = customer_data["LastInvoiceDate"].max()
today

In [None]:
# Recency: days elapsed between the customer's last transaction day and `today`.
# Dividing the timedelta by one day yields the interval as a float number of days.
customer_data["Recency"] = (today - customer_data["LastInvoiceDate"]) / pd.Timedelta(days=1)
customer_data["Recency"].hist()

In [None]:
# Customer lifetime: days elapsed between the customer's first transaction day and `today`.
# Dividing the timedelta by one day yields the interval as a float number of days.
customer_data["Lifetime"] = (today - customer_data["FirstInvoiceDate"]) / pd.Timedelta(days=1)
customer_data["Lifetime"].hist()

In [None]:
# Summary statistics of the customer lifetime (in days).
customer_data["Lifetime"].describe()

In [None]:
# Calculate the relative recency: 1 - recency / customer lifetime
# Note: this is to normalise the fact that newer customers have lower lifetime and thus lower recency
# by nature. With this definition a value of 0 means the last purchase day is also the first one
# (Recency == Lifetime), and values close to 1 mean the last purchase is recent relative to the lifetime.
# Customers whose Lifetime is 0 (first purchase on `today`) give 0 / 0 here and therefore NaN.
customer_data["RelRecency"] = 1 - customer_data["Recency"] / customer_data["Lifetime"]
customer_data["RelRecency"].hist()

In [None]:
# Summary statistics of the absolute recency (in days).
customer_data["Recency"].describe()

In [None]:
# Summary statistics of the relative recency.
customer_data["RelRecency"].describe()

In [None]:
# Share of customers whose relative recency is exactly 0, i.e. whose last
# invoice date is also their first invoice date.
(customer_data["RelRecency"] == 0).mean()
# This shows that 31% of customers only had one transaction.

In [None]:
# Relative frequency: frequency / customer lifetime.
# This normalises the fact that newer customers have a shorter lifetime and
# therefore, by nature, a lower frequency.
customer_data["RelFrequency"] = customer_data["Frequency"].div(customer_data["Lifetime"])
# Count how many values are infinite (a Lifetime of 0 causes a division by zero).
np.isinf(customer_data["RelFrequency"]).value_counts()

In [None]:
# An infinite relative frequency was found; show the affected rows before removing them.
customer_data.loc[np.isinf(customer_data["RelFrequency"])]

In [None]:
# Remove the rows with an infinite relative frequency.
# .copy() makes the filtered table an independent DataFrame, so the new columns
# assigned to it in later cells do not raise SettingWithCopyWarning.
customer_data = customer_data[~np.isinf(customer_data["RelFrequency"])].copy()
customer_data["RelFrequency"].hist()

In [None]:
# Summary statistics of the relative frequency after removing the infinite values.
customer_data["RelFrequency"].describe()

In [None]:
# Before calculating the relative monetary value (monetary value / customer lifetime),
# look at the distribution of the absolute monetary value per customer.
# Note: the relative value normalises the fact that newer customers have lower lifetime
# and thus lower monetary value by nature; it is computed in a later cell.
customer_data["MonetaryValue"].describe()

In [None]:
# Histogram of the absolute monetary value per customer.
customer_data["MonetaryValue"].hist()
# There are some outliers in terms of monetary value.

In [None]:
# 99th percentile of the per-customer monetary value, used as the outlier cut-off in the next cell.
np.percentile(customer_data["MonetaryValue"], 99)
# NOTE(review): the original note quoted the 99.5% percentile (33.4K, i.e., 0.5% * 4K = 20 customers),
# but the code computes the 99th percentile — confirm which threshold is intended.
# We should exclude these outliers from our analysis.

In [None]:
# Keep only customers at or below the 99th percentile of monetary value.
# .copy() makes the filtered table an independent DataFrame, so the columns
# assigned to it in later cells do not raise SettingWithCopyWarning.
customer_data = customer_data[
    customer_data["MonetaryValue"] <= np.percentile(customer_data["MonetaryValue"], 99)
].copy()

In [None]:
# Number of customers left after removing the monetary-value outliers.
len(customer_data)

In [None]:
# Number of customers whose total monetary value is negative.
int((customer_data["MonetaryValue"] < 0).sum())
# 43 customers have a negative transaction value because of returns.

In [None]:
# Preview the customers with a negative total monetary value.
customer_data.loc[customer_data["MonetaryValue"].lt(0)].head()

In [None]:
# Look at the raw invoice lines of one such customer (ID 12454).
data.loc[data["CustomerID"].eq(12454)]
# Some customers have a negative sum of order value. This is because they have returns.

In [None]:
# Relative monetary value: total monetary value divided by the customer lifetime in days.
customer_data["RelMonetaryValue"] = customer_data["MonetaryValue"].div(customer_data["Lifetime"])
customer_data["RelMonetaryValue"].hist()

In [None]:
# Summary statistics of the relative monetary value.
customer_data["RelMonetaryValue"].describe()

# Create RFM buckets using absolute values

In [None]:
# Create RFM buckets using absolute values
# For this analysis, we take the 25%, 50% and 75% quartiles of recency, frequency and monetary value and use them as the edges of
# 4 bins for each variable. We label those bins as 1, 2, 3 and 4 and use them as the scores for R, F and M respectively.
# We then create the RFM score as the sum of the three scores.
# By doing so we end up with 10 clusters (RFM scores from 3 to 12).

In [None]:
# Score absolute recency by quartile. Bin edges are -1, the 25th/50th/75th
# percentiles and the maximum; the lowest-recency (most recent) bin gets score 4
# and the highest-recency bin gets score 1.
recency_edges = [
    -1,
    *np.percentile(customer_data["Recency"], [25, 50, 75]),
    customer_data["Recency"].max(),
]
customer_data['RecencyScore'] = pd.cut(
    customer_data["Recency"], bins=recency_edges, labels=[4, 3, 2, 1]
).astype("int")
customer_data["RecencyScore"].value_counts()

In [None]:
# Score absolute frequency by quartile. Bin edges are -1, the 25th/50th/75th
# percentiles and the maximum; the lowest bin gets score 1 and the highest score 4.
frequency_edges = [
    -1,
    *np.percentile(customer_data["Frequency"], [25, 50, 75]),
    customer_data["Frequency"].max(),
]
customer_data["FrequencyScore"] = pd.cut(
    customer_data["Frequency"], bins=frequency_edges, labels=[1, 2, 3, 4]
).astype("int")
customer_data["FrequencyScore"].value_counts()

In [None]:
# Score absolute monetary value by quartile. The lowest edge is the minimum
# minus 1 (values can be negative), followed by the 25th/50th/75th percentiles
# and the maximum; the lowest bin gets score 1 and the highest score 4.
monetary_edges = [
    customer_data["MonetaryValue"].min() - 1,
    *np.percentile(customer_data["MonetaryValue"], [25, 50, 75]),
    customer_data["MonetaryValue"].max(),
]
customer_data["MonetaryScore"] = pd.cut(
    customer_data["MonetaryValue"], bins=monetary_edges, labels=[1, 2, 3, 4]
).astype("int")
customer_data["MonetaryScore"].value_counts()

In [None]:
# RFM score: the sum of the recency, frequency and monetary scores of each customer.
customer_data["RFM"] = customer_data[["RecencyScore", "FrequencyScore", "MonetaryScore"]].sum(axis=1)
customer_data["RFM"].value_counts()

In [None]:
# Summarise every absolute RFM score: min / median / max of recency, frequency,
# monetary value and lifetime, plus the number of customers (count of Lifetime).
# Aggregations are passed by name rather than as NumPy callables, so the
# second-level column labels are simply "min", "median", "max" and "count".
rfm_abs = pd.DataFrame(
    customer_data.pivot_table(
        index=["RFM"],
        values=["Recency", "Frequency", "MonetaryValue", "Lifetime"],
        aggfunc={
            "Recency": ["min", "median", "max"],
            "Frequency": ["min", "median", "max"],
            "MonetaryValue": ["min", "median", "max"],
            "Lifetime": ["min", "median", "max", "count"],
        },
    )
)
rfm_abs

As we can see, the groups have very different median days of lifetime. This suggests potential bias associated with customer sign up date.

# Create RFM buckets using relative values

In [None]:
# Create RFM buckets using relative values
# For each of relative recency, relative frequency and relative monetary value,
# the 25th, 50th and 75th percentiles are used as edges of 4 bins, labelled 1 to 4.
# Those labels are the R, F and M scores; their sum is the RFM score.
# Relative recency: the lowest bin gets score 1 and the highest bin gets score 4.
rel_recency_edges = [
    -1,
    *np.percentile(customer_data["RelRecency"], [25, 50, 75]),
    customer_data["RelRecency"].max(),
]
customer_data["RecencyScore"] = pd.cut(
    customer_data["RelRecency"], bins=rel_recency_edges, labels=[1, 2, 3, 4]
).astype("int")
customer_data["RecencyScore"].value_counts()

In [None]:
# Score relative frequency by quartile. Bin edges are -1, the 25th/50th/75th
# percentiles and the maximum; the lowest bin gets score 1 and the highest score 4.
rel_frequency_edges = [
    -1,
    *np.percentile(customer_data["RelFrequency"], [25, 50, 75]),
    customer_data["RelFrequency"].max(),
]
customer_data["FrequencyScore"] = pd.cut(
    customer_data["RelFrequency"], bins=rel_frequency_edges, labels=[1, 2, 3, 4]
).astype("int")
customer_data["FrequencyScore"].value_counts()

In [None]:
# Score relative monetary value by quartile. The lowest edge is the minimum
# minus 1 (values can be negative), followed by the 25th/50th/75th percentiles
# and the maximum; the lowest bin gets score 1 and the highest score 4.
rel_monetary_edges = [
    customer_data["RelMonetaryValue"].min() - 1,
    *np.percentile(customer_data["RelMonetaryValue"], [25, 50, 75]),
    customer_data["RelMonetaryValue"].max(),
]
customer_data["MonetaryScore"] = pd.cut(
    customer_data["RelMonetaryValue"], bins=rel_monetary_edges, labels=[1, 2, 3, 4]
).astype("int")
customer_data["MonetaryScore"].value_counts()

In [None]:
# Preview the customer table with the scores now based on the relative values.
customer_data.head()

In [None]:
# Recompute the RFM score as the sum of the (now relative) R, F and M scores.
customer_data["RFM"] = customer_data[["RecencyScore", "FrequencyScore", "MonetaryScore"]].sum(axis=1)
customer_data["RFM"].value_counts()

In [None]:
# Summarise every relative RFM score: min / median / max of recency, frequency,
# monetary value and lifetime, plus the number of customers (count of Lifetime).
# Aggregations are passed by name rather than as NumPy callables, so the
# second-level column labels are simply "min", "median", "max" and "count".
rfm_rel = pd.DataFrame(
    customer_data.pivot_table(
        index=["RFM"],
        values=["Recency", "Frequency", "MonetaryValue", "Lifetime"],
        aggfunc={
            "Recency": ["min", "median", "max"],
            "Frequency": ["min", "median", "max"],
            "MonetaryValue": ["min", "median", "max"],
            "Lifetime": ["min", "median", "max", "count"],
        },
    )
)
rfm_rel
# Note that the median lifetime is rather constant across clusters. This is good news - our segmentation is not biased by the lifetime
# of the customers.

In [None]:
# Compare the median lifetime per RFM score between the absolute segmentation
# (top panel) and the relative segmentation (bottom panel). Each panel carries
# its own title so the two bar charts can be told apart.
fig, axes = plt.subplots(nrows=2, ncols=1)
fig.suptitle('Median Lifetime of RFM Segments (Absolute vs Relative)')
rfm_abs["Lifetime"]["median"].plot(ax=axes[0], kind="bar", title="Absolute RFM")
rfm_rel["Lifetime"]["median"].plot(ax=axes[1], kind="bar", title="Relative RFM")
# Make room so the lower panel's title does not collide with the upper panel's tick labels.
fig.tight_layout()
plt.show()

In [None]:
# Visualise segments using 3D plot
x = customer_data["RelRecency"]
y = customer_data["RelFrequency"]
# Min-max standardisation: (value - min) / (max - min), which maps the relative
# monetary value onto [0, 1]. Dividing by the maximum alone does not do that
# when the minimum is non-zero (it is negative here because of returns).
z = (customer_data["RelMonetaryValue"] - customer_data["RelMonetaryValue"].min()) / (
    customer_data["RelMonetaryValue"].max() - customer_data["RelMonetaryValue"].min()
)
# Colour every point by its RFM score.
c = customer_data["RFM"]
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection='3d')
ax.scatter(xs=x, ys=y, zs=z, c=c, s=30, alpha=0.5)
ax.set_title("RFM Visualisation")
ax.set_xlabel("Relative Recency")
ax.set_ylabel("Relative Frequency")
ax.set_zlabel("Relative Monetary Value (with Min-Max Standardisation)")
plt.show()

In [None]:
# Pairwise scatter plots of the three relative dimensions (x, y, z from the
# previous cell), stacked vertically with shared axes.
fig, axs = plt.subplots(nrows=3, ncols=1, sharex=True, sharey=True, figsize=(5, 15))
fig.suptitle('Correlation between R, F and M')

panels = [
    (x, y, "Relative Recency (x) vs Relative Frequency (y)"),
    (y, z, "Relative Frequency (x) vs Relative Monetary Value (y)"),
    (x, z, "Relative Recency (x) vs Relative Monetary Value (y)"),
]
for panel_ax, (horizontal, vertical, panel_title) in zip(axs, panels):
    panel_ax.scatter(horizontal, vertical, alpha=0.5)
    panel_ax.set_title(panel_title)


In [None]:
# 3D scatter of the discrete R, F and M scores, coloured by the RFM score.
x = customer_data["RecencyScore"]
y = customer_data["FrequencyScore"]
z = customer_data["MonetaryScore"]
c = customer_data["RFM"]
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection='3d')
ax.scatter(x, y, z, c=c, s=30, alpha=0.5)
ax.set(
    title="RFM Visualisation",
    xlabel="Recency Score",
    ylabel="Frequency Score",
    zlabel="Monetary Score",
)
plt.show()

# Generate simple criteria for labelling

In [None]:
# From these statistics we can generate some heuristic business rules to simplify our segmentation.
# For example, RFM scores 3 and 4 can be combined, because these two groups tend to have only one transaction.
# The summary statistics below are used to pick thresholds for the new rule-based columns.
customer_data.describe()

In [None]:
# Show the per-score summary of the relative segmentation again for reference.
rfm_rel

In [None]:
# Three boolean business-rule flags per customer.
customer_data["last_order_within_l60d"] = customer_data["Recency"].lt(60)  # last transaction fewer than 60 days ago
customer_data["more_than_two_orders"] = customer_data["Frequency"].gt(2)  # purchases on more than two distinct dates
customer_data["value_higher_than_2k"] = customer_data["MonetaryValue"].gt(2000)  # total value higher than 2K
# Number of customers in every combination of the three flags.
customer_data.groupby(["last_order_within_l60d", "more_than_two_orders", "value_higher_than_2k"])["Lifetime"].count()

In [None]:
# Map the three rule flags to five named segments. The conditions are checked
# in order and the first match wins; "Others" is the fallback label.
recent = customer_data["last_order_within_l60d"]
frequent = customer_data["more_than_two_orders"]
valuable = customer_data["value_higher_than_2k"]

conditions = [
    recent & frequent & valuable,
    recent & frequent & ~valuable,
    recent & ~frequent,
    ~recent & frequent,
    ~recent & ~frequent,
]
mappings = [
    "01. high engagement & high value",
    "02. high engagement & low value",
    "03. recent and low frequency",
    "04. old and high frequency",
    "05. low engagement & low value",
]
customer_data['FinalRFM'] = np.select(conditions, mappings, default="Others")
customer_data['FinalRFM'].value_counts()

In [None]:
# Summarise every rule-based segment: min / median / max of recency, frequency,
# monetary value and lifetime, plus the number of customers (count of Lifetime).
# Aggregations are passed by name rather than as NumPy callables, so the
# second-level column labels are simply "min", "median", "max" and "count".
customer_data.pivot_table(
    index=["FinalRFM"],
    values=["Recency", "Frequency", "MonetaryValue", "Lifetime"],
    aggfunc={
        "Recency": ["min", "median", "max"],
        "Frequency": ["min", "median", "max"],
        "MonetaryValue": ["min", "median", "max"],
        "Lifetime": ["min", "median", "max", "count"],
    },
)

# Bonus: calculate the percentage of new, active, inactive and return users in each month

In [None]:
# Count new, active, inactive and return users in each month.
# Definitions:
# New users: made their first purchase in the current month
# Active users: made purchases in the previous month and in the current month
# Inactive users: made purchases in previous months, but not in the current month
# Return users: made purchases before the previous month, none in the previous month,
#               and purchased again in the current month
# "InvoiceMonth" is the invoice date formatted as the first day of its month (YYYY-MM-01).
data["InvoiceMonth"] = data["InvoiceDate"].dt.strftime("%Y-%m-01")
# One row per customer, one column per month, cells = number of invoice lines (0 if none).
user_month_pivot = data.pivot_table(
    index=["CustomerID"],
    columns=["InvoiceMonth"],
    values=["InvoiceNo"],
    aggfunc="count",
    fill_value=0,
)
user_month_pivot.head()

In [None]:
# Replace the invoice counts with a 1/0 flag: 1 if the customer purchased in
# that month, 0 otherwise. A vectorised comparison replaces the element-wise
# applymap call, which is slower and deprecated in recent pandas releases.
user_month_pivot = (user_month_pivot > 0).astype(int)
user_month_pivot.head()

In [None]:
# Number of month columns in the pivot.
user_month_pivot.shape[1]

In [None]:
# Define functions to get user status
def user_status(data):
    """Label each month of one customer's purchase history with a status.

    Parameters
    ----------
    data : iterable of numbers
        One entry per month in chronological order. 0 means no purchase in
        that month; any other value means the customer purchased. Any length
        is accepted (a list or a pandas Series both work), and the values are
        read by iteration rather than by integer position.

    Returns
    -------
    list of str
        One status per input entry:
        - "unreg":    no purchase yet (the customer has never bought anything)
        - "new":      first purchase ever
        - "active":   purchase in this month and in the previous month
        - "inactive": purchased before, but not in this month
        - "return":   purchase in this month after an inactive previous month
    """
    status = []
    # Before the first month the customer is treated as never having purchased.
    previous = "unreg"
    for purchased in data:
        if purchased == 0:
            # No purchase: stay unregistered until the first purchase has
            # happened, otherwise the customer is inactive this month.
            current = "unreg" if previous == "unreg" else "inactive"
        elif previous == "unreg":
            current = "new"
        elif previous == "inactive":
            current = "return"
        else:
            # Previous month was "new", "active" or "return".
            current = "active"
        status.append(current)
        previous = current
    return status

In [None]:
# Apply user_status to every customer row and keep the pivot's index and
# month columns, giving one status per customer and month.
user_month_status = pd.DataFrame(
    [user_status(monthly_flags) for _customer_id, monthly_flags in user_month_pivot.iterrows()],
    index=user_month_pivot.index,
    columns=user_month_pivot.columns,
)
user_month_status.head()

In [None]:
# Count how many customers have each status in each month. "unreg" is turned
# into a missing value first so that value_counts skips it.
# np.nan is used because the np.NaN alias no longer exists in NumPy 2.0, and
# Series.value_counts replaces the deprecated top-level pd.value_counts.
month_status_pivot = user_month_status.replace("unreg", np.nan).apply(
    lambda month_column: month_column.value_counts()
)
month_status_pivot.head()

In [None]:
# Statuses that never occur in a month become 0, months move to the rows, and
# the table is indexed by the "InvoiceMonth" label.
month_status_pivot = (
    month_status_pivot
    .fillna(0)
    .T
    .reset_index()
    .set_index("InvoiceMonth")
)

In [None]:
# Stacked area chart of the number of users per status over the months.
ax = month_status_pivot.plot.area(figsize=(12, 6))
ax.set_title("Number of Users by Status in each month")
plt.show()