In [28]:
import pandas as pd
from datetime import timedelta
import matplotlib.pyplot as plt


In [29]:
# Load the customer table, then discard every row that has a missing value.
data = pd.read_csv(
    "datasets/customer_data.csv",
    encoding="ISO-8859-1",
)
data = data.dropna()

data.head()

Unnamed: 0,customer_id,revenue,most_recent_visit,number_of_orders,recency_days
0,22086,777,5/14/2006,9,232
1,2290,1555,9/8/2006,16,115
2,26377,336,11/19/2006,5,43
3,24650,1189,10/29/2006,12,64
4,12883,1229,12/9/2006,12,23


In [30]:
# Parse the visit dates (month/day/year strings such as "5/14/2006") into
# pandas datetimes. The format is stated explicitly so the result does not
# depend on pandas' format inference.
data["most_recent_visit"] = pd.to_datetime(
    data["most_recent_visit"], format="%m/%d/%Y"
)

# Most recent visit date across all customers.
last_date = data["most_recent_visit"].max()

# Monetary value = average revenue per order.
# NOTE(review): a row with number_of_orders == 0 would yield inf/NaN here —
# confirm the dataset contains no such rows.
data["Monetary"] = data["revenue"] / data["number_of_orders"]

data.head()




Unnamed: 0,customer_id,revenue,most_recent_visit,number_of_orders,recency_days,Monetary
0,22086,777,2006-05-14,9,232,86.333333
1,2290,1555,2006-09-08,16,115,97.1875
2,26377,336,2006-11-19,5,43,67.2
3,24650,1189,2006-10-29,12,64,99.083333
4,12883,1229,2006-12-09,12,23,102.416667


In [31]:
# Start the RFM working frame as an independent copy of `data`.
# A plain `df = data` would only bind a second name to the same object, so any
# in-place write through `df` would also change `data`.
df = data.copy()

df.head()

Unnamed: 0,customer_id,revenue,most_recent_visit,number_of_orders,recency_days,Monetary
0,22086,777,2006-05-14,9,232,86.333333
1,2290,1555,2006-09-08,16,115,97.1875
2,26377,336,2006-11-19,5,43,67.2
3,24650,1189,2006-10-29,12,64,99.083333
4,12883,1229,2006-12-09,12,23,102.416667


In [33]:
# Build the RFM table: rename the two metric columns to their RFM names and
# drop the columns that are no longer needed.
# The chain starts from `data` rather than reassigning `df` from itself, so the
# cell can be re-run safely; re-running `df = df.drop(...)` on the already
# trimmed frame would raise KeyError because the columns are gone.
df = (
    data
    .rename(columns={
        "recency_days": "Recency",
        "number_of_orders": "Frequency",
    })
    .drop(columns=["revenue", "most_recent_visit"])
)

df.tail()

Unnamed: 0,customer_id,Frequency,Recency,Monetary
39994,3249,10,31,99.8
39995,6686,8,187,96.375
39996,16418,9,154,112.888889
39997,9117,7,195,96.857143
39998,19184,13,113,116.846154


In [34]:
# Score each metric into terciles (q=3: three equal-frequency bins).
# Recency labels run 3 -> 1 so the smallest recency values get the top score;
# Frequency and Monetary labels run 1 -> 3 so larger values score higher.

df["R"] = pd.qcut(df["Recency"], 3, labels=range(3, 0, -1))
df["F"] = pd.qcut(df["Frequency"], 3, labels=range(1, 4))
df["M"] = pd.qcut(df["Monetary"], 3, labels=range(1, 4))

df.head(10)

Unnamed: 0,customer_id,Frequency,Recency,Monetary,R,F,M
0,22086,9,232,86.333333,1,2,1
1,2290,16,115,97.1875,2,3,2
2,26377,5,43,67.2,3,1,1
3,24650,12,64,99.083333,3,3,2
4,12883,12,23,102.416667,3,3,2
5,2119,11,72,84.454545,3,2,1
6,31283,17,112,92.294118,2,3,2
7,33815,11,142,70.727273,2,2,1
8,15972,9,43,71.222222,3,2,1
9,27650,10,131,97.0,2,2,2


In [35]:
# Combined RFM score = R + F + M for each customer.
# R, F and M come from pd.qcut with labels, so they are categorical columns;
# they are cast to integers first because arithmetic on categoricals is not a
# supported operation in pandas.
df["RFM"] = df[["R", "F", "M"]].astype(int).sum(axis=1)

df.head()

Unnamed: 0,customer_id,Frequency,Recency,Monetary,R,F,M,RFM
0,22086,9,232,86.333333,1,2,1,4
1,2290,16,115,97.1875,2,3,2,7
2,26377,5,43,67.2,3,1,1,5
3,24650,12,64,99.083333,3,3,2,8
4,12883,12,23,102.416667,3,3,2,8


In [36]:
# The RFM function

def rfm_segmentation_func(df):
    """Return the segment name for one record based on its combined RFM score.

    Parameters
    ----------
    df : mapping-like
        A single record that supports ``df['RFM']`` lookup. Despite the name,
        this is one row (it is used with ``DataFrame.apply(..., axis=1)``),
        not a whole DataFrame.

    Returns
    -------
    str
        "Superstar" when the score is 7 or more, "High Potential" when it is
        at least 4 but below 7, and "Low Relevance" otherwise.
    """
    score = df['RFM']
    if score >= 7:
        return "Superstar"
    if score >= 4:
        return "High Potential"
    return "Low Relevance"

In [37]:
# Label every customer by running the segmentation rule once per row.
df["RFM-Segment"] = df.apply(rfm_segmentation_func, axis="columns")

df.head()

Unnamed: 0,customer_id,Frequency,Recency,Monetary,R,F,M,RFM,RFM-Segment
0,22086,9,232,86.333333,1,2,1,4,High Potential
1,2290,16,115,97.1875,2,3,2,7,Superstar
2,26377,5,43,67.2,3,1,1,5,High Potential
3,24650,12,64,99.083333,3,3,2,8,Superstar
4,12883,12,23,102.416667,3,3,2,8,Superstar


In [38]:
# Number of customers in each RFM segment, largest segment first.
df["RFM-Segment"].value_counts()

RFM-Segment
High Potential    22768
Superstar         14805
Low Relevance      2426
Name: count, dtype: int64

In [40]:
# Per-segment profile: mean Recency, mean Frequency, mean Monetary, and the
# customer count (taken from the Monetary column), rounded to one decimal.
(
    df.groupby("RFM-Segment")
    .agg({"Recency": "mean", "Frequency": "mean", "Monetary": ["mean", "count"]})
    .round(1)
)

Unnamed: 0_level_0,Recency,Frequency,Monetary,Monetary
Unnamed: 0_level_1,mean,mean,mean,count
RFM-Segment,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
High Potential,215.3,8.9,92.3,22768
Low Relevance,362.1,6.2,71.7,2426
Superstar,99.7,11.9,104.2,14805
