In [24]:
import pandas as pd
import numpy as np

from warnings import filterwarnings
# NOTE(review): with no category or module filter this hides every warning
# raised for the rest of the session — consider narrowing it to the specific
# warning categories that are known to be noise.
filterwarnings("ignore")

In [25]:
# Load the per-customer feature table from the intermediate data directory.
cf = pd.read_csv(
    filepath_or_buffer="data_intermediate/customer_features.csv",
)

In [26]:
# Preview the first five rows of the feature table.
# NOTE(review): the rendered preview includes the LOGONID, EMAIL and NAME
# columns — clear or redact this output before sharing the notebook.
cf.head(n=5)

Unnamed: 0,MEMBER_ID,total_orders,total_items,total_units,gross_product_spend,order_adj_total,shipping_total,tax_total,unique_products,unique_types,...,top_type,top_type_share,REGISTERTYPE,LASTORDER,REGISTRATION,LASTSESSION,REGISTRATIONUPDATE,LOGONID,EMAIL,NAME
0,3003,2,2,2,100.0,-10.0,0.0,0.0,1,1,...,ItemBean,1.0,R,2025-11-20,2025-11-20,2025-11-20,2025-11-20,pkumar,pkumar@solveda.com,Pranav Kumar
1,4002,2,2,2,255.0,-27.38,11.88,0.0,2,1,...,ItemBean,1.0,R,2025-11-25,2025-11-21,2025-11-25,2025-11-21,robertdowneyjr1@yopmail.com,robertdowneyjr1@yopmail.com,Robert Downey Jr
2,4003,1,3,3,3342.0,-191.76,86.76,0.0,3,1,...,ItemBean,1.0,R,2025-11-21,2025-11-21,2025-11-21,2025-11-21,johnchambers@yopmail.com,johnchambers@yopmail.com,John Chambers
3,4004,2,12,12,16256.0,-467.96,162.36,0.0,11,1,...,ItemBean,1.0,R,2025-11-21,2025-11-21,2025-11-21,2025-11-21,andymurray@yopmail.com,andymurray@yopmail.com,Andy Murray
4,4005,2,3,3,319.0,-36.7,19.8,0.0,3,1,...,ItemBean,1.0,R,2025-11-21,2025-11-21,2025-11-21,2025-11-21,michaels@yopmail.com,michaels@yopmail.com,Michael S


In [27]:
# Build the RFM base table: keep the customer key plus the three source
# metrics, and give the metrics their conventional RFM names in one chain.
rfm = (
    cf.loc[:, ["MEMBER_ID", "days_since_last_purchase", "total_orders", "net_spend"]]
    .copy()
    .rename(
        columns={
            "days_since_last_purchase": "recency",
            "total_orders": "frequency",
            "net_spend": "monetary",
        }
    )
)

In [None]:
# Quartile scoring, step 1: validate recency before any thresholds are computed.
# Stop early when no customer has a usable recency value at all.
if rfm["recency"].isna().all():
    raise ValueError(
        "All recency values are NaN. This usually means TIMEPLACED failed to parse in S_02 "
        "or your valid filtering removed all dated orders. Fix S_02 first."
    )

# Customers without a dated purchase stay in the table; they receive a recency
# one larger than the largest observed value so they land in the worst bin (R=1).
rfm = rfm.assign(recency=rfm["recency"].fillna(rfm["recency"].max() + 1))

def score_r(x, q):  # lower recency = better
    """Map a recency value to a 1-4 quartile score where smaller values score higher.

    Parameters
    ----------
    x : scalar
        The recency value to score.
    q : mapping or Series
        Quartile cut points, indexed by 0.25, 0.50 and 0.75.

    Returns
    -------
    int
        4 when ``x`` is at or below the 25% cut point, 3 up to the median,
        2 up to the 75% cut point, and 1 otherwise (this includes NaN, for
        which every comparison is False).
    """
    # Walk the cut points from best to worst and stop at the first one that holds.
    for level, points in ((0.25, 4), (0.50, 3), (0.75, 2)):
        if x <= q[level]:
            return points
    return 1

def score_fm(x, q):  # higher is better
    """Map a frequency or monetary value to a 1-4 quartile score (higher is better).

    Parameters
    ----------
    x : scalar
        The value to score.
    q : mapping or Series
        Quartile cut points, indexed by 0.25, 0.50 and 0.75.

    Returns
    -------
    int
        1 when ``x`` is at or below the 25% cut point, 2 up to the median,
        3 up to the 75% cut point, and 4 above it. A missing ``x`` scores 1.
    """
    # Every comparison against NaN is False, so without this guard a missing
    # value would fall through all three checks and receive the top score.
    if pd.isna(x):
        return 1
    if x <= q[0.25]:
        return 1
    if x <= q[0.50]:
        return 2
    if x <= q[0.75]:
        return 3
    return 4

# Quartile cut points (25%, 50%, 75%) for each of the three metrics.
rq, fq, mq = (
    rfm[metric].quantile([0.25, 0.5, 0.75])
    for metric in ("recency", "frequency", "monetary")
)

# Score every customer against the cut points of the matching metric.
# NOTE(review): when a metric has many tied values, several cut points can
# coincide and some of the four scores may never be assigned.
rfm["R"] = rfm["recency"].apply(score_r, args=(rq,))
rfm["F"] = rfm["frequency"].apply(score_fm, args=(fq,))
rfm["M"] = rfm["monetary"].apply(score_fm, args=(mq,))

# Overall score: unweighted mean of the three component scores.
rfm["RFM_score"] = rfm.loc[:, ["R", "F", "M"]].mean(axis=1)

In [29]:
# State segment rules (simple, effective)
def segment(row):
    """Return the segment label for one customer from its R, F and M scores.

    ``row`` is anything indexable by the keys "R", "F" and "M". The rules are
    checked in order and the first match wins; rows matching no rule are
    labelled "Regular".
    """
    r_score, f_score, m_score = row["R"], row["F"], row["M"]

    if r_score == 4 and f_score == 4 and m_score == 4:
        label = "Champions"
    elif r_score >= 3 and f_score >= 3 and m_score >= 3:
        label = "Loyal"
    elif r_score == 4 and f_score <= 2:
        label = "New/Promising"
    elif r_score <= 2 and f_score >= 3:
        label = "At Risk"
    elif r_score == 1 and f_score == 1:
        label = "Churned"
    else:
        label = "Regular"
    return label

# Label each customer row-wise; segment() only reads the three score columns,
# so only those are handed to it.
rfm["state_segment"] = rfm[["R", "F", "M"]].apply(segment, axis=1)

In [30]:
# Spot-check the first five scored and segmented customers.
rfm.head(n=5)

Unnamed: 0,MEMBER_ID,recency,frequency,monetary,R,F,M,RFM_score,state_segment
0,3003,11,2,90.0,1,1,1,1.0,Churned
1,4002,6,2,227.62,1,1,2,1.333333,Churned
2,4003,10,1,3150.24,1,1,4,2.0,Churned
3,4004,10,2,15788.04,1,1,4,2.0,Churned
4,4005,10,2,282.3,1,1,3,1.666667,Churned


In [31]:
# Persist the scored RFM table; index=False keeps the row index out of the file.
rfm.to_csv(
    path_or_buf="data_intermediate/rfm_analysis.csv",
    index=False,
)