In [1]:
import pandas as pd
import plotly.express as px
import plotly.io as pio
import plotly.graph_objects as go

# Use the "plotly_white" template as the default for every figure built below.
pio.templates.default = "plotly_white"

In [33]:
import warnings

# Suppress all warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the transaction-level export; the path is relative to the working directory.
data = pd.read_csv("rfm_data.csv")
# Preview the first rows to check that the columns parsed as expected.
data.head()

Unnamed: 0,CustomerID,PurchaseDate,TransactionAmount,ProductInformation,OrderID,Location
0,8814,2023-04-11,943.31,Product C,890075,Tokyo
1,2188,2023-04-11,463.7,Product A,176819,London
2,4608,2023-04-11,80.28,Product A,340062,New York
3,2559,2023-04-11,221.29,Product A,239145,London
4,9482,2023-04-11,739.56,Product A,194545,Paris


## Calculate RFM Values

In [3]:
from datetime import datetime

In [17]:
# Parse 'PurchaseDate' so date arithmetic works (a no-op if already datetime).
data['PurchaseDate'] = pd.to_datetime(data['PurchaseDate'])

# Recency: whole days between today and each purchase date, computed with
# vectorised datetime arithmetic instead of a per-row Python lambda.
# NOTE: the reference date is "today", so these values grow by one for every
# day that passes between runs of the notebook.
reference_date = pd.Timestamp(datetime.now().date())
data['Recency'] = (reference_date - data['PurchaseDate'].dt.normalize()).dt.days

# Drop Frequency/MonetaryValue columns left over from a previous run of this
# cell; without this the merges below would create *_x / *_y duplicates and
# later cells that read data['Frequency'] would fail.
data = data.drop(columns=['Frequency', 'MonetaryValue'], errors='ignore')

# Frequency: number of orders per customer, attached to every row of that customer.
frequency_data = (
    data.groupby('CustomerID')['OrderID']
    .count()
    .rename('Frequency')
    .reset_index()
)
data = data.merge(frequency_data, on='CustomerID', how='left')

# Monetary value: total transaction amount per customer, attached the same way.
monetary_data = (
    data.groupby('CustomerID')['TransactionAmount']
    .sum()
    .rename('MonetaryValue')
    .reset_index()
)
data = data.merge(monetary_data, on='CustomerID', how='left')


In [18]:
# Confirm that the Recency, Frequency and MonetaryValue columns were added.
data.head()

Unnamed: 0,CustomerID,PurchaseDate,TransactionAmount,ProductInformation,OrderID,Location,Recency,Frequency,MonetaryValue
0,8814,2023-04-11,943.31,Product C,890075,Tokyo,465,1,943.31
1,2188,2023-04-11,463.7,Product A,176819,London,465,1,463.7
2,4608,2023-04-11,80.28,Product A,340062,New York,465,1,80.28
3,2559,2023-04-11,221.29,Product A,239145,London,465,1,221.29
4,9482,2023-04-11,739.56,Product A,194545,Paris,465,1,739.56


## Calculate RFM Scores

In [20]:
# Score labels, listed in bin order (the lowest-valued bin comes first).
recency_scores = [5, 4, 3, 2, 1]    # fewer days since purchase -> higher score
frequency_scores = [1, 2, 3, 4, 5]  # higher frequency -> higher score
monetary_scores = [1, 2, 3, 4, 5]   # higher monetary value -> higher score

# Cut each raw metric into five bins and label every bin with its score.
score_specs = {
    'RecencyScore': ('Recency', recency_scores),
    'FrequencyScore': ('Frequency', frequency_scores),
    'MonetaryScore': ('MonetaryValue', monetary_scores),
}
for score_column, (metric_column, bin_labels) in score_specs.items():
    data[score_column] = pd.cut(data[metric_column], bins=5, labels=bin_labels)

In [21]:
# Check dtypes and non-null counts; the three score columns are 'category' here.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   CustomerID          1000 non-null   int64         
 1   PurchaseDate        1000 non-null   datetime64[ns]
 2   TransactionAmount   1000 non-null   float64       
 3   ProductInformation  1000 non-null   object        
 4   OrderID             1000 non-null   int64         
 5   Location            1000 non-null   object        
 6   Recency             1000 non-null   int64         
 7   Frequency           1000 non-null   int64         
 8   MonetaryValue       1000 non-null   float64       
 9   RecencyScore        1000 non-null   category      
 10  FrequencyScore      1000 non-null   category      
 11  MonetaryScore       1000 non-null   category      
dtypes: category(3), datetime64[ns](1), float64(2), int64(4), object(2)
memory usage: 74.0+ KB


We can see that these scores are stored as categorical variables, so we need to convert them to integers before we can do arithmetic on them.

In [22]:
# Turn the categorical score labels into plain integers so they can be summed.
for score_column in ('RecencyScore', 'FrequencyScore', 'MonetaryScore'):
    data[score_column] = data[score_column].astype(int)

## RFM Value Segmentation

Now let’s calculate the final RFM score and assign each row a value segment based on that score.

In [24]:
# Combined RFM score: the sum of the three individual scores.
data['RFM_Score'] = (
    data['RecencyScore']
    .add(data['FrequencyScore'])
    .add(data['MonetaryScore'])
)

# Split the combined score into three quantile-based value segments,
# labelled from the lowest scores to the highest.
segment_labels = ['Low-Value', 'Mid-Value', 'High-Value']
data['Value Segment'] = pd.qcut(x=data['RFM_Score'], q=3, labels=segment_labels)


In [26]:
# Look at the last rows to check the score and segment columns.
data.tail()

Unnamed: 0,CustomerID,PurchaseDate,TransactionAmount,ProductInformation,OrderID,Location,Recency,Frequency,MonetaryValue,RecencyScore,FrequencyScore,MonetaryScore,RFM_Score,Value Segment
995,2970,2023-06-10,759.62,Product B,275284,London,405,1,759.62,5,1,2,8,High-Value
996,6669,2023-06-10,941.5,Product C,987025,New York,405,1,941.5,5,1,2,8,High-Value
997,8836,2023-06-10,545.36,Product C,512842,London,405,1,545.36,5,1,2,8,High-Value
998,1440,2023-06-10,729.94,Product B,559753,Paris,405,1,729.94,5,1,2,8,High-Value
999,4759,2023-06-10,804.28,Product D,467544,New York,405,1,804.28,5,1,2,8,High-Value


Let's take a look at the segment distribution

In [28]:
# Row count per value segment as a two-column frame: segment label, count.
segment_counts = (
    data['Value Segment']
    .value_counts()
    .rename_axis('Value Segment')
    .reset_index(name='Count')
)

# One bar per segment, coloured by segment.
fig_segment_dist = px.bar(
    segment_counts,
    x='Value Segment',
    y='Count',
    color='Value Segment',
    title='RFM Value Segment Distribution',
)

# The legend would only repeat the x-axis categories, so it is hidden.
segment_dist_layout = dict(
    xaxis_title='RFM Value Segment',
    yaxis_title='Count',
    showlegend=False,
)
fig_segment_dist.update_layout(**segment_dist_layout)
fig_segment_dist.show()

## RFM Customer Segments

Now let’s create and analyze RFM Customer Segments that are broader classifications based on the RFM scores. These segments, such as “Champions”, “Potential Loyalists”, and “Can’t Lose” provide a more strategic perspective on customer behaviour and characteristics in terms of recency, frequency, and monetary aspects.

In [36]:
# Start every row with an empty label; rows matched by no rule below keep ''.
data['RFM Customer Segments'] = ''

# (row mask, label) rules keyed on the combined RFM score. The score ranges do
# not overlap, so each row matches at most one rule.
rfm_score = data['RFM_Score']
segment_rules = [
    (rfm_score >= 9, 'Champions'),
    ((rfm_score >= 6) & (rfm_score < 9), 'Potential Loyalists'),
    ((rfm_score >= 5) & (rfm_score < 6), 'At Risk Customers'),
    ((rfm_score >= 4) & (rfm_score < 5), "Can't Lose"),
    ((rfm_score >= 3) & (rfm_score < 4), 'Lost'),
]
for row_mask, segment_name in segment_rules:
    data.loc[row_mask, 'RFM Customer Segments'] = segment_name

# Show the assigned label next to each customer ID.
data[['CustomerID', 'RFM Customer Segments']]

Unnamed: 0,CustomerID,RFM Customer Segments
0,8814,Can't Lose
1,2188,Lost
2,4608,Lost
3,2559,Lost
4,9482,Can't Lose
...,...,...
995,2970,Potential Loyalists
996,6669,Potential Loyalists
997,8836,Potential Loyalists
998,1440,Potential Loyalists


## RFM Analysis

Now let’s analyze the distribution of customers across different RFM customer segments within each value segment

In [37]:
# Count rows for every (value segment, customer segment) pair that occurs.
# 'Value Segment' is categorical (it comes from pd.qcut), so observed=True is
# needed to stop groupby from emitting zero-count rows for label combinations
# that never appear in the data, and from warning about the changing default.
segment_product_counts = (
    data.groupby(['Value Segment', 'RFM Customer Segments'], observed=True)
    .size()
    .reset_index(name='Count')
    .sort_values('Count', ascending=False)
)

# Two-level treemap: value segment on the outside, customer segment inside,
# with tile area proportional to the row count.
fig_treemap_segment_product = px.treemap(
    segment_product_counts,
    path=['Value Segment', 'RFM Customer Segments'],
    values='Count',
    color='Value Segment',
    color_discrete_sequence=px.colors.qualitative.Pastel,
    title='RFM Customer Segments by Value'
)

fig_treemap_segment_product.show()

Now let’s analyze the distribution of the recency, frequency, and monetary scores within the Champions segment.

In [38]:
# Keep only the rows labelled as Champions.
is_champion = data['RFM Customer Segments'] == 'Champions'
champion_segment = data[is_champion]

# One box per score column, added in recency / frequency / monetary order.
box_specs = [
    ('Recency', 'RecencyScore'),
    ('Frequency', 'FrequencyScore'),
    ('Monetary', 'MonetaryScore'),
]
fig = go.Figure()
for trace_name, score_column in box_specs:
    fig.add_trace(go.Box(y=champion_segment[score_column], name=trace_name))

fig.update_layout(
    title='Distribution of RFM Values within Champions Segment',
    yaxis_title='RFM Value',
    showlegend=True,
)

fig.show()

Now let’s analyze the correlation of the recency, frequency, and monetary scores within the champions segment

In [41]:
# Pairwise correlation of the three score columns, Champions rows only.
champion_score_columns = ['RecencyScore', 'FrequencyScore', 'MonetaryScore']
corr_matrix = champion_segment[champion_score_columns].corr()

# The matrix is square, so the same labels serve both axes.
axis_labels = corr_matrix.columns
heatmap_trace = go.Heatmap(
    z=corr_matrix.values,
    x=axis_labels,
    y=axis_labels,
    colorscale='RdBu',
    colorbar=dict(title='Correlation'),
)

fig_heatmap = go.Figure(data=heatmap_trace)
fig_heatmap.update_layout(title='Correlation Matrix of RFM Values within Champions Segment')

fig_heatmap.show()

Now let’s have a look at the number of customers in all the segments

In [51]:
import plotly.colors

# Plotly's qualitative "Pastel" colour list, used for the segment bars below.
pastel_colors = plotly.colors.qualitative.Pastel

In [52]:
# Row count per RFM customer segment (index = segment label).
segment_counts = data['RFM Customer Segments'].value_counts()

# Exactly one colour per bar: a fixed highlight for Champions, otherwise the
# pastel palette, wrapped with modulo so that having more segments than
# palette entries cannot raise an IndexError.
champions_color = 'rgb(158, 202, 225)'
bar_colors = [
    champions_color if segment == 'Champions'
    else pastel_colors[i % len(pastel_colors)]
    for i, segment in enumerate(segment_counts.index)
]

# Styling is set once on the trace rather than being set and then overwritten.
fig = go.Figure(data=[go.Bar(
    x=segment_counts.index,
    y=segment_counts.values,
    marker=dict(
        color=bar_colors,
        line=dict(color='rgb(8, 48, 107)', width=1.5),
    ),
    opacity=0.6,
)])

# Title and axis labels so the chart stands on its own, like the other figures.
fig.update_layout(title='RFM Customer Segment Distribution',
                  xaxis_title='RFM Customer Segment',
                  yaxis_title='Count')

fig.show()

Now let’s have a look at the recency, frequency, and monetary scores of all the segments

In [57]:
# Mean recency, frequency and monetary score for each customer segment.
mean_score_columns = ['RecencyScore', 'FrequencyScore', 'MonetaryScore']
segment_scores = (
    data.groupby('RFM Customer Segments')[mean_score_columns]
    .mean()
    .reset_index()
)

# (score column, legend name, bar colour) for each series, in drawing order.
bar_specs = [
    ('RecencyScore', 'Recency Score', 'rgb(158,202,225)'),
    ('FrequencyScore', 'Frequency Score', 'rgb(94,158,217)'),
    ('MonetaryScore', 'Monetary Score', 'rgb(32,102,148)'),
]

fig = go.Figure()
for score_column, trace_name, bar_color in bar_specs:
    fig.add_trace(go.Bar(
        x=segment_scores['RFM Customer Segments'],
        y=segment_scores[score_column],
        name=trace_name,
        marker_color=bar_color,
    ))

# Grouped bars: the three scores sit side by side within each segment.
layout_options = dict(
    title='Comparison of RFM Segments based on Recency, Frequency, and Monetary Scores',
    xaxis_title='RFM Segments',
    yaxis_title='Score',
    barmode='group',
    showlegend=True,
)
fig.update_layout(**layout_options)

fig.show()