In [2]:
import pandas as pd

# Load the cleaned retail transactions and parse the invoice timestamps.
df = pd.read_csv("../data/processed/retail_cleaned.csv")
df["invoicedate"] = pd.to_datetime(df["invoicedate"])

# RFM snapshot date: one day AFTER the latest invoice in the dataset, so every
# customer's recency comes out as at least 1 day.
snapshot_date = df["invoicedate"].max() + pd.Timedelta(days=1)

# One row per customer:
#   recency   - whole days between the snapshot date and the last invoice
#   frequency - number of distinct invoice numbers
#   monetary  - summed revenue
by_customer = df.groupby("customerid")
rfm = pd.DataFrame(
    {
        "recency": (snapshot_date - by_customer["invoicedate"].max()).dt.days,
        "frequency": by_customer["invoiceno"].nunique(),
        "monetary": by_customer["revenue"].sum(),
    }
).reset_index()

# Quintile scores (1-5). Recency labels run in reverse, so the lowest recency
# gets 5; for frequency and monetary the highest values get 5.
# Frequency is ranked with method="first" before binning, which gives tied
# values distinct ranks in order of appearance (presumably to keep the
# quantile bin edges unique).
rfm["R_score"] = pd.qcut(rfm["recency"], q=5, labels=[5, 4, 3, 2, 1]).astype(int)
rfm["F_score"] = pd.qcut(
    rfm["frequency"].rank(method="first"), q=5, labels=[1, 2, 3, 4, 5]
).astype(int)
rfm["M_score"] = pd.qcut(rfm["monetary"], q=5, labels=[1, 2, 3, 4, 5]).astype(int)

# Combined code: the three score digits concatenated in R, F, M order.
score_digits = rfm[["R_score", "F_score", "M_score"]].astype(str)
rfm["RFM_Score"] = score_digits["R_score"].str.cat(
    [score_digits["F_score"], score_digits["M_score"]]
)

rfm.head()


Unnamed: 0,customerid,recency,frequency,monetary,R_score,F_score,M_score,RFM_Score
0,12346.0,326,1,77183.6,1,1,5,115
1,12347.0,2,7,4310.0,5,5,5,555
2,12348.0,75,4,1797.24,2,4,4,244
3,12349.0,19,1,1757.55,4,1,4,414
4,12350.0,310,1,334.4,1,1,2,112


In [3]:
def segment(row):
    """Return the segment name for one customer from its R and F scores.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) indexed by
            "R_score" and "F_score".

    Returns:
        "Champions"      when R_score >= 4 and F_score >= 4,
        "New Customers"  when R_score >= 4 and F_score <= 2,
        "At Risk"        when R_score <= 2 and F_score >= 4,
        "Lost"           when R_score <= 2 and F_score <= 2,
        "Regular"        in every other case.
    """
    recency_score = row["R_score"]

    # Choose the (high-frequency, low-frequency) label pair from recency;
    # a mid-range recency score is "Regular" regardless of frequency.
    if recency_score >= 4:
        high_freq_label, low_freq_label = "Champions", "New Customers"
    elif recency_score <= 2:
        high_freq_label, low_freq_label = "At Risk", "Lost"
    else:
        return "Regular"

    frequency_score = row["F_score"]
    if frequency_score >= 4:
        return high_freq_label
    if frequency_score <= 2:
        return low_freq_label
    return "Regular"

# Label each customer by passing its row through segment().
segment_labels = rfm.apply(segment, axis=1)
rfm["Segment"] = segment_labels

# Number of customers in each segment.
rfm["Segment"].value_counts()


Segment
Regular          1540
Champions        1139
Lost             1065
New Customers     319
At Risk           275
Name: count, dtype: int64

In [4]:
# Write the scored and segmented table to disk without the row index,
# then confirm where it was saved.
output_path = "../outputs/rfm_segments.csv"
rfm.to_csv(output_path, index=False)
print("Saved:", output_path)


Saved: ../outputs/rfm_segments.csv
