In [1]:
# Load libraries
import pandas as pd
import numpy as np
import seaborn as sns
import warnings
# NOTE(review): with no category/module argument this silences every warning for the
# rest of the session, including any pandas warnings raised by later cells.
# Consider a narrower filter once the notebook is stable.
warnings.filterwarnings("ignore")

  import pandas.util.testing as tm


In [2]:
# Load data from the Excel workbook; the path is relative to the working directory
df = pd.read_excel("Online Retail.xlsx")

In [3]:
# Take a glimpse of the data (head() shows the first five rows by default)
df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom


In [5]:
# Count missing values per column; the customer identifier is the field of interest here
df.isna().sum()

InvoiceNo           0
StockCode           0
Description      1454
Quantity            0
InvoiceDate         0
UnitPrice           0
CustomerID     135080
Country             0
dtype: int64

In [6]:
# Inspect the smallest Quantity value.
# A negative minimum means the data contains negative quantities.
df['Quantity'].min()

-80995

In [7]:
# Keep only the rows that carry a CustomerID
df = df.dropna(subset=['CustomerID'])

In [8]:
# Keep only rows with a strictly positive Quantity (drops negative and zero quantities)
df = df.loc[df['Quantity'].gt(0)]

In [9]:
# Add a column for total price (line total = Quantity * UnitPrice).
# `df` was produced by boolean filtering in the cells above, so writing a column into
# it in place is the pattern pandas flags with SettingWithCopyWarning. assign()
# returns a new frame with the extra column instead.
df = df.assign(TotalPrice=df['Quantity'] * df['UnitPrice'])

In [10]:
# Find max date: the most recent InvoiceDate present in the data
df['InvoiceDate'].max()

Timestamp('2011-12-09 12:50:00')

In [11]:
import datetime as dt
# Parse InvoiceDate first so the reference date below is computed on real timestamps.
# assign() returns a new frame rather than writing into the filtered frame in place.
df = df.assign(InvoiceDate=pd.to_datetime(df['InvoiceDate']))
# Reference "today" for the recency calculation: midnight of the day after the most
# recent invoice. Derived from the data instead of being hard-coded, so it stays
# correct if the input file changes (a latest invoice on 2011-12-09 gives 2011-12-10).
NOW = dt.datetime.combine(df['InvoiceDate'].max().date() + dt.timedelta(days=1), dt.time.min)

In [12]:
# RFM model: one row per CustomerID.
#   recency        - whole days between NOW and the customer's latest InvoiceDate
#   frequency      - number of rows for the customer (this counts invoice lines,
#                    not distinct InvoiceNo values)
#   monetary_value - sum of TotalPrice over the customer's rows
rfmTable = (
    df.groupby('CustomerID')
    .agg({'InvoiceDate': lambda dates: (NOW - dates.max()).days,
          'InvoiceNo': lambda invoices: len(invoices),
          'TotalPrice': lambda amounts: amounts.sum()})
    .astype({'InvoiceDate': int})

    .rename(columns={'InvoiceDate': 'recency',
                     'InvoiceNo': 'frequency',
                     'TotalPrice': 'monetary_value'})
)
rfmTable.head()

Unnamed: 0_level_0,recency,frequency,monetary_value
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
12346.0,325,1,77183.6
12347.0,2,182,4310.0
12348.0,75,31,1797.24
12349.0,18,73,1757.55
12350.0,310,17,334.4


In [13]:
# Cut points at the 20th/40th/60th/80th percentiles of every rfmTable column,
# stored as a nested dict: quantiles[column][q] -> threshold value
quantiles = rfmTable.quantile(q=[0.2, 0.4, 0.6, 0.8]).to_dict()

In [18]:
def RScore(x,p,d):
    """Score a value from 5 (smallest) down to 1 (largest) against cut points.

    x: the value to score.
    p: key selecting which set of cut points in ``d`` to use.
    d: nested mapping read as ``d[p][q]`` for q in 0.20, 0.40, 0.60, 0.80.

    Returns 5 when x is at or below the 0.20 cut point, 4 up to 0.40,
    3 up to 0.60, 2 up to 0.80, and 1 otherwise.
    """
    cutoffs = d[p]
    # Thresholds are checked in ascending order; the first one x does not exceed wins.
    for level, score in ((0.20, 5), (0.40, 4), (0.60, 3), (0.80, 2)):
        if x <= cutoffs[level]:
            return score
    return 1
    
def FMScore(x,p,d):
    """Score a value from 1 (smallest) up to 5 (largest) against cut points.

    x: the value to score.
    p: key selecting which set of cut points in ``d`` to use.
    d: nested mapping read as ``d[p][q]`` for q in 0.20, 0.40, 0.60, 0.80.

    Returns 1 when x is at or below the 0.20 cut point, 2 up to 0.40,
    3 up to 0.60, 4 up to 0.80, and 5 otherwise.
    """
    cutoffs = d[p]
    # Thresholds are checked in ascending order; the first one x does not exceed wins.
    for level, score in ((0.20, 1), (0.40, 2), (0.60, 3), (0.80, 4)):
        if x <= cutoffs[level]:
            return score
    return 5

In [19]:
# Work on an independent copy. A plain assignment would only alias rfmTable, so the
# score and label columns added to segmented_rfm below would also appear in rfmTable,
# and re-running the quantile cell would then see those extra, non-numeric columns.
segmented_rfm = rfmTable.copy()

In [20]:
# Score data: bucket each metric against its own cut points in `quantiles`.
# RScore gives 5 to the smallest values, FMScore gives 5 to the largest.
# The columns are named *_quartile but hold 1-5 scores (five buckets).
segmented_rfm['r_quartile'] = segmented_rfm['recency'].apply(
    lambda value: RScore(value, 'recency', quantiles))
segmented_rfm['f_quartile'] = segmented_rfm['frequency'].apply(
    lambda value: FMScore(value, 'frequency', quantiles))
segmented_rfm['m_quartile'] = segmented_rfm['monetary_value'].apply(
    lambda value: FMScore(value, 'monetary_value', quantiles))
segmented_rfm.head()

Unnamed: 0_level_0,recency,frequency,monetary_value,r_quartile,f_quartile,m_quartile,RFMScore
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
12346.0,325,1,77183.6,1,1,5,515
12347.0,2,182,4310.0,5,5,5,155
12348.0,75,31,1797.24,2,3,4,434
12349.0,18,73,1757.55,4,4,4,244
12350.0,310,17,334.4,1,2,2,522


In [21]:
# RFM Score: the three single-digit scores concatenated as text, in R, F, M order
segmented_rfm['RFMScore'] = (
    segmented_rfm['r_quartile'].astype(str)
    + segmented_rfm['f_quartile'].astype(str)
    + segmented_rfm['m_quartile'].astype(str)
)
segmented_rfm.head()

Unnamed: 0_level_0,recency,frequency,monetary_value,r_quartile,f_quartile,m_quartile,RFMScore
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
12346.0,325,1,77183.6,1,1,5,115
12347.0,2,182,4310.0,5,5,5,555
12348.0,75,31,1797.24,2,3,4,234
12349.0,18,73,1757.55,4,4,4,444
12350.0,310,17,334.4,1,2,2,122


In [23]:
# Segment customers.
# Each key is a regex over the two-character "<r score><f score>" code and each value
# is the segment label it is replaced with. Together the patterns cover every
# combination of scores 1-5.
# The spelling "Can't Loose" is kept as is: a later cell filters on this exact label.
segmentation_map = {
    r'[1-2][1-2]': 'Hibernating',
    r'[1-2][3-4]': 'At Risk',
    r'[1-2]5': 'Can\'t Loose',
    r'3[1-2]': 'About to Sleep',
    r'33': 'Need Attention',
    r'[3-4][4-5]': 'Loyal Customers',
    r'41': 'Promising',
    r'51': 'New Customers',
    r'[4-5][2-3]': 'Potential Loyalists',
    r'5[4-5]': 'Champions'
}

# Build the two-digit code and translate it to its label in a single assignment
segmented_rfm['Segment'] = (
    segmented_rfm['r_quartile'].map(str) + segmented_rfm['f_quartile'].map(str)
).replace(segmentation_map, regex=True)

In [24]:
# See segmented customers (first rows, now including the Segment label)
segmented_rfm.head()

Unnamed: 0_level_0,recency,frequency,monetary_value,r_quartile,f_quartile,m_quartile,RFMScore,Segment
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
12346.0,325,1,77183.6,1,1,5,115,Hibernating
12347.0,2,182,4310.0,5,5,5,555,Champions
12348.0,75,31,1797.24,2,3,4,234,At Risk
12349.0,18,73,1757.55,4,4,4,444,Loyal Customers
12350.0,310,17,334.4,1,2,2,122,Hibernating


In [25]:
# Segmentation statistics: mean / median / count / std of each RFM metric per segment
rfmStats1 = (
    segmented_rfm
    .groupby("Segment")[["recency", "frequency", "monetary_value"]]
    .agg(["mean", "median", "count", "std"])
)
# Flatten the (metric, statistic) column pairs into names such as "recency_mean"
rfmStats1.columns = ['_'.join(pair).strip('|') for pair in rfmStats1.columns]
rfmStats1

Unnamed: 0_level_0,recency_mean,recency_median,recency_count,recency_std,frequency_mean,frequency_median,frequency_count,frequency_std,monetary_value_mean,monetary_value_median,monetary_value_count,monetary_value_std
Segment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
About to Sleep,52.192547,51.5,322,11.057209,15.987578,16,322,8.115972,450.997174,324.685,322,643.749221
At Risk,165.435852,144.0,569,82.593437,56.859402,51,569,22.65575,996.916872,747.81,569,1015.83414
Can't Loose,143.22619,117.0,84,73.923006,181.666667,156,84,71.000311,2370.705012,2116.51,84,1697.752066
Champions,5.40099,4.0,606,3.743768,289.031353,166,606,536.250916,6960.915446,2619.805,606,20735.090858
Hibernating,209.251397,211.0,1074,89.778884,13.589385,13,1074,8.151748,536.535672,288.3,1074,3005.946594
Loyal Customers,33.206854,29.0,817,15.484023,157.116279,117,817,121.488208,2845.732852,1775.18,817,6004.584153
Need Attention,51.985366,53.0,205,11.511068,41.736585,40,205,8.293565,856.195854,705.12,205,988.663752
New Customers,6.421053,7.0,57,3.385618,7.578947,8,57,3.990826,3618.697018,213.96,57,22305.203885
Potential Loyalists,15.668699,16.0,492,9.25086,34.943089,34,492,12.614801,915.450409,604.195,492,1874.419436
Promising,22.362832,22.0,113,5.461001,7.707965,8,113,3.949868,425.633628,197.64,113,764.078636


### Business Questions

In [29]:
# Best customers: rows labelled with the "Champions" segment
segment_name = "Champions"
best_customers = segmented_rfm.loc[segmented_rfm['Segment'].eq(segment_name)]
best_customers.head()

Unnamed: 0_level_0,recency,frequency,monetary_value,r_quartile,f_quartile,m_quartile,RFMScore,Segment
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
12378.0,129,219,4008.62,2,5,5,255,Can't Loose
12501.0,336,149,2169.39,1,5,5,155,Can't Loose
12502.0,95,147,3723.87,2,5,5,255,Can't Loose
12520.0,79,152,2634.26,2,5,5,255,Can't Loose
12688.0,113,171,4873.81,2,5,5,255,Can't Loose


In [30]:
# Who are your loyal customers? Rows labelled "Loyal Customers"
segment_name = "Loyal Customers"
loyal_customers = segmented_rfm.loc[segmented_rfm['Segment'].eq(segment_name)]
loyal_customers.head()

Unnamed: 0_level_0,recency,frequency,monetary_value,r_quartile,f_quartile,m_quartile,RFMScore,Segment
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
12349.0,18,73,1757.55,4,4,4,444,Loyal Customers
12352.0,36,85,2506.04,3,4,5,345,Loyal Customers
12356.0,22,59,2811.43,4,4,5,445,Loyal Customers
12357.0,33,131,6207.67,3,5,5,355,Loyal Customers
12359.0,57,248,6372.58,3,5,5,355,Loyal Customers


In [31]:
# Which customers are on the verge of churning? Rows labelled "At Risk"
segment_name = "At Risk"
churn_customers = segmented_rfm.loc[segmented_rfm['Segment'].eq(segment_name)]
churn_customers.head()

Unnamed: 0_level_0,recency,frequency,monetary_value,r_quartile,f_quartile,m_quartile,RFMScore,Segment
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
12348.0,75,31,1797.24,2,3,4,234,At Risk
12354.0,232,58,1079.4,1,3,4,134,At Risk
12377.0,315,77,1628.12,1,4,4,144,At Risk
12379.0,81,40,852.24,2,3,3,233,At Risk
12383.0,184,99,1850.56,1,4,4,144,At Risk


In [32]:
# Who has the potential to be converted into more profitable customers?
# Rows labelled "Potential Loyalists"
segment_name = "Potential Loyalists"
possible_conversion_customers = segmented_rfm.loc[segmented_rfm['Segment'].eq(segment_name)]
possible_conversion_customers.head()

Unnamed: 0_level_0,recency,frequency,monetary_value,r_quartile,f_quartile,m_quartile,RFMScore,Segment
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
12358.0,1,19,1168.06,5,2,4,524,Potential Loyalists
12374.0,25,33,742.93,4,3,3,433,Potential Loyalists
12375.0,10,17,457.5,5,2,2,522,Potential Loyalists
12384.0,28,27,585.27,4,2,3,423,Potential Loyalists
12421.0,15,45,807.04,4,3,3,433,Potential Loyalists


In [33]:
# Who are the lost customers that you don't need to pay much attention to?
# Rows labelled "Hibernating"
segment_name = "Hibernating"
lost_customers = segmented_rfm.loc[segmented_rfm['Segment'].eq(segment_name)]
lost_customers.head()

Unnamed: 0_level_0,recency,frequency,monetary_value,r_quartile,f_quartile,m_quartile,RFMScore,Segment
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
12346.0,325,1,77183.6,1,1,5,115,Hibernating
12350.0,310,17,334.4,1,2,2,122,Hibernating
12353.0,204,4,89.0,1,1,1,111,Hibernating
12355.0,214,13,459.4,1,1,2,112,Hibernating
12361.0,287,10,189.9,1,1,1,111,Hibernating


In [34]:
# Which customers must you retain? Rows labelled "Can't Loose"
# (the label text has to match the segmentation map exactly)
segment_name = "Can't Loose"
must_retain_customers = segmented_rfm.loc[segmented_rfm['Segment'].eq(segment_name)]
must_retain_customers.head()

Unnamed: 0_level_0,recency,frequency,monetary_value,r_quartile,f_quartile,m_quartile,RFMScore,Segment
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
12378.0,129,219,4008.62,2,5,5,255,Can't Loose
12501.0,336,149,2169.39,1,5,5,155,Can't Loose
12502.0,95,147,3723.87,2,5,5,255,Can't Loose
12520.0,79,152,2634.26,2,5,5,255,Can't Loose
12688.0,113,171,4873.81,2,5,5,255,Can't Loose


In [35]:
# Which group of customers is most likely to respond to your current campaign?
# Rows labelled "Promising"
segment_name = "Promising"
campaign_respond_customers = segmented_rfm.loc[segmented_rfm['Segment'].eq(segment_name)]
campaign_respond_customers.head()

Unnamed: 0_level_0,recency,frequency,monetary_value,r_quartile,f_quartile,m_quartile,RFMScore,Segment
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
12445.0,22,4,133.4,4,1,1,411,Promising
12504.0,18,9,482.05,4,1,2,412,Promising
12586.0,17,11,196.94,4,1,1,411,Promising
12603.0,21,3,739.2,4,1,3,413,Promising
12618.0,21,11,177.31,4,1,1,411,Promising
