In [12]:
import pandas as pd
import numpy as np
from pathlib import Path

# Load cleaned transaction data
weekly_path = Path("../data/processed/weekly_sku_demand.csv")
weekly = pd.read_csv(weekly_path, parse_dates=["WeekStart"])

# Load raw transactions again for customer-level work
raw_path = Path("../data/raw/online_retail_ii.xlsx")
tx = pd.read_excel(raw_path)

# Normalize column names
tx = tx.rename(columns={
    "Customer ID": "CustomerID",
    "Price": "UnitPrice",
}).copy()

tx["InvoiceDate"] = pd.to_datetime(tx["InvoiceDate"], errors="coerce")
tx["Invoice"] = tx["Invoice"].astype(str)

# Core cleaning (same logic as Project #1)
tx = tx.dropna(subset=["CustomerID", "InvoiceDate", "Quantity", "UnitPrice"])
tx = tx.loc[~tx["Invoice"].str.startswith("C")]
tx = tx.loc[tx["Quantity"] > 0]
tx = tx.loc[tx["UnitPrice"] > 0]

tx["LineRevenue"] = tx["Quantity"] * tx["UnitPrice"]

tx.head(), tx.shape

(  Invoice StockCode                          Description  Quantity  \
 0  489434     85048  15CM CHRISTMAS GLASS BALL 20 LIGHTS        12   
 1  489434    79323P                   PINK CHERRY LIGHTS        12   
 2  489434    79323W                  WHITE CHERRY LIGHTS        12   
 3  489434     22041         RECORD FRAME 7" SINGLE SIZE         48   
 4  489434     21232       STRAWBERRY CERAMIC TRINKET BOX        24   
 
           InvoiceDate  UnitPrice  CustomerID         Country  LineRevenue  
 0 2009-12-01 07:45:00       6.95     13085.0  United Kingdom         83.4  
 1 2009-12-01 07:45:00       6.75     13085.0  United Kingdom         81.0  
 2 2009-12-01 07:45:00       6.75     13085.0  United Kingdom         81.0  
 3 2009-12-01 07:45:00       2.10     13085.0  United Kingdom        100.8  
 4 2009-12-01 07:45:00       1.25     13085.0  United Kingdom         30.0  ,
 (407664, 9))

In [13]:
tx["CustomerID"].nunique()

4312

In [14]:
# Reference date for recency (end of dataset)
analysis_date = tx["InvoiceDate"].max() + pd.Timedelta(days=1)

customer_features = (
    tx.groupby("CustomerID")
      .agg(
          recency_days=("InvoiceDate", lambda x: (analysis_date - x.max()).days),
          frequency=("Invoice", "nunique"),
          monetary_value=("LineRevenue", "sum"),
          avg_order_value=("LineRevenue", lambda x: x.sum() / x.nunique()),
          total_units=("Quantity", "sum"),
          unique_skus=("StockCode", "nunique"),
          first_purchase=("InvoiceDate", "min"),
          last_purchase=("InvoiceDate", "max"),
      )
      .reset_index()
)

# Customer lifespan in days
customer_features["customer_lifespan_days"] = (
    customer_features["last_purchase"] - customer_features["first_purchase"]
).dt.days

customer_features.head(), customer_features.shape

(   CustomerID  recency_days  frequency  monetary_value  avg_order_value  \
 0     12346.0           165         11          372.86        62.143333   
 1     12347.0             3          2         1323.32        40.100606   
 2     12348.0            74          1          222.16        44.432000   
 3     12349.0            43          3         2671.14        62.119535   
 4     12351.0            11          1          300.93        23.148462   
 
    total_units  unique_skus      first_purchase       last_purchase  \
 0           70           26 2009-12-14 08:34:00 2010-06-28 13:53:00   
 1          828           70 2010-10-31 14:20:00 2010-12-07 14:57:00   
 2          373           20 2010-09-27 14:59:00 2010-09-27 14:59:00   
 3          993           90 2010-04-29 13:20:00 2010-10-28 08:23:00   
 4          261           21 2010-11-29 15:23:00 2010-11-29 15:23:00   
 
    customer_lifespan_days  
 0                     196  
 1                      37  
 2                   

In [15]:
customer_features.describe(percentiles=[0.5, 0.75, 0.9, 0.95, 0.99])

Unnamed: 0,CustomerID,recency_days,frequency,monetary_value,avg_order_value,total_units,unique_skus,customer_lifespan_days
count,4312.0,4312.0,4312.0,4312.0,4312.0,4312.0,4312.0,4312.0
mean,15349.290353,91.171846,4.455705,2048.238236,66.745955,1284.404917,63.646104,133.998609
std,1701.200176,96.860633,8.170213,8914.48128,310.675213,6459.16458,85.757966,132.827183
min,12346.0,1.0,1.0,2.95,2.95,1.0,1.0,0.0
50%,15350.5,53.0,2.0,706.02,34.13066,382.0,38.0,105.0
75%,16834.25,136.0,5.0,1723.1425,56.098211,996.25,79.25,254.0
90%,17690.9,254.9,9.0,3800.225,97.741507,2190.8,149.0,343.0
95%,17987.45,304.0,13.0,6237.6035,161.091356,3439.25,203.45,358.0
99%,18230.89,368.0,31.0,20137.2375,620.6822,13147.65,388.56,369.0
max,18287.0,374.0,205.0,349164.35,13916.34,220600.0,1741.0,373.0


In [16]:
features_for_model = customer_features.copy()

# Log-transform skewed features
for col in [
    "monetary_value",
    "avg_order_value",
    "frequency",
    "total_units",
    "unique_skus",
    "customer_lifespan_days",
]:
    features_for_model[f"log_{col}"] = np.log1p(features_for_model[col])

model_cols = [
    "recency_days",
    "log_frequency",
    "log_monetary_value",
    "log_avg_order_value",
    "log_total_units",
    "log_unique_skus",
]

X = features_for_model[model_cols].copy()
X.head()

Unnamed: 0,recency_days,log_frequency,log_monetary_value,log_avg_order_value,log_total_units,log_unique_skus
0,165,2.484907,5.923881,4.145407,4.26268,3.295837
1,3,1.098612,7.188654,3.716023,6.72022,4.26268
2,74,0.693147,5.407889,3.816217,5.924256,3.044522
3,43,1.386294,7.890635,4.14503,6.901737,4.51086
4,11,0.693147,5.710195,3.184221,5.568345,3.091042


In [17]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [18]:
customer_features[["recency_days", "frequency", "monetary_value"]].describe(percentiles=[0.9, 0.95, 0.99])

Unnamed: 0,recency_days,frequency,monetary_value
count,4312.0,4312.0,4312.0
mean,91.171846,4.455705,2048.238236
std,96.860633,8.170213,8914.48128
min,1.0,1.0,2.95
50%,53.0,2.0,706.02
90%,254.9,9.0,3800.225
95%,304.0,13.0,6237.6035
99%,368.0,31.0,20137.2375
max,374.0,205.0,349164.35


In [19]:
from sklearn.cluster import KMeans

# Fit KMeans with 5 clusters
kmeans = KMeans(
    n_clusters=5,
    random_state=42,
    n_init=20
)

clusters = kmeans.fit_predict(X_scaled)

customer_features["segment"] = clusters
customer_features["segment"].value_counts()

0    1476
2    1197
1     761
4     606
3     272
Name: segment, dtype: int64

In [20]:
segment_summary = (
    customer_features
    .groupby("segment")
    .agg(
        customers=("CustomerID", "nunique"),
        avg_recency_days=("recency_days", "mean"),
        avg_frequency=("frequency", "mean"),
        avg_monetary_value=("monetary_value", "mean"),
        avg_order_value=("avg_order_value", "mean"),
    )
    .sort_values("avg_monetary_value", ascending=False)
)

segment_summary

Unnamed: 0_level_0,customers,avg_recency_days,avg_frequency,avg_monetary_value,avg_order_value
segment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
4,606,25.506601,15.40264,9453.283508,131.545533
0,1476,56.218835,4.292005,1460.752457,40.849506
3,272,108.602941,2.294118,1099.755404,360.023319
2,1197,56.785297,1.620718,360.307084,24.857429
1,761,259.113009,1.287779,284.916941,26.435491


In [21]:
segment_names = {
    4: "High-Value Loyal",
    0: "Growing Repeat",
    3: "Bulk / Wholesale",
    2: "Low-Value / One-Time",
    1: "At-Risk / Churned",
}

customer_features["segment_name"] = customer_features["segment"].map(segment_names)

customer_features[["CustomerID", "segment_name"]].head()

Unnamed: 0,CustomerID,segment_name
0,12346.0,Growing Repeat
1,12347.0,Growing Repeat
2,12348.0,Low-Value / One-Time
3,12349.0,Growing Repeat
4,12351.0,Low-Value / One-Time


In [22]:
customer_features.to_csv(
    Path("../data/processed/customer_features.csv"),
    index=False
)

In [23]:
customer_segments = customer_features[["CustomerID", "segment_name"]].copy()

out_path = Path("../data/processed/customer_segments.csv")
customer_segments.to_csv(out_path, index=False)
out_path

WindowsPath('../data/processed/customer_segments.csv')

In [24]:
# tx is the cleaned transactions table with LineRevenue, StockCode, CustomerID
tx2 = tx.copy()
tx2["StockCode"] = tx2["StockCode"].astype(str)

tx2 = tx2.merge(customer_segments, on="CustomerID", how="left")

sku_segment_mix = (
    tx2.groupby(["StockCode", "segment_name"], as_index=False)
       .agg(
           revenue=("LineRevenue", "sum"),
           units=("Quantity", "sum"),
           orders=("Invoice", "nunique"),
           customers=("CustomerID", "nunique"),
       )
)

# Add shares within SKU
sku_segment_mix["revenue_share"] = sku_segment_mix["revenue"] / sku_segment_mix.groupby("StockCode")["revenue"].transform("sum")
sku_segment_mix["unit_share"] = sku_segment_mix["units"] / sku_segment_mix.groupby("StockCode")["units"].transform("sum")

out_path = Path("../data/processed/sku_segment_mix.csv")
sku_segment_mix.to_csv(out_path, index=False)
out_path

WindowsPath('../data/processed/sku_segment_mix.csv')