In [36]:
import pandas as pd
import numpy as np

from warnings import filterwarnings
filterwarnings("ignore")

In [42]:
tx = pd.read_csv("data_intermediate/fact_transactions.csv")
users = pd.read_csv("data_intermediate/dim_users.csv")

DT_FMT = "%Y-%m-%d"

tx["TIMEPLACED"] = (
    tx["TIMEPLACED"]
    .astype("string")
    .str.strip()
)

tx["TIMEPLACED"] = pd.to_datetime(tx["TIMEPLACED"], format=DT_FMT, errors="coerce")

In [43]:
# choose valid rows for time-based features
valid = tx[tx["TIMEPLACED"].notna()].copy()

# OPTIONAL: if you know which statuses are valid, uncomment and adjust:
# valid = valid[valid["STATUS"].isin(["C", "COMPLETED", "SHIPPED"])]

analysis_date = valid["TIMEPLACED"].max()

print("analysis_date:", analysis_date)

analysis_date: 2025-12-01 00:00:00


In [44]:
# customer level aggregation
cust = valid.groupby("MEMBER_ID_order").agg(
    total_orders=("ORDERS_ID","nunique"),
    total_items=("ORDERITEMS_ID","nunique"),
    total_units=("QUANTITY","sum"),
    gross_product_spend=("TOTALPRODUCT_order","sum"),
    order_adj_total=("TOTALADJUSTMENT_order","sum"),
    shipping_total=("TOTALSHIPPING","sum"),
    tax_total=("TOTALTAX","sum"),

    unique_products=("PARTNUM","nunique"),
    unique_types=("TYPE","nunique"),
    unique_brands=("MFNAME","nunique"),

    promo_orders=("promo_count", lambda x: (x > 0).mean()),
    promo_adj_total=("promo_adj_total","sum"),

    last_purchase=("TIMEPLACED","max"),
    first_purchase=("TIMEPLACED","min"),
).reset_index()

cust["net_spend"] = cust["gross_product_spend"] + cust["order_adj_total"]
cust["avg_order_value"] = cust["net_spend"] / cust["total_orders"].replace(0, np.nan)
cust["avg_units_per_order"] = cust["total_units"] / cust["total_orders"].replace(0, np.nan)
cust["days_since_last_purchase"] = (analysis_date - cust["last_purchase"]).dt.days
cust["purchase_span_days"] = (cust["last_purchase"] - cust["first_purchase"]).dt.days
cust["shipping_ratio"] = cust["shipping_total"] / cust["net_spend"].replace(0, np.nan)

# rename MEMBER_ID_order to MEMBER_ID for merging
cust = cust.rename(columns={"MEMBER_ID_order":"MEMBER_ID"})

In [45]:
# type-level spend share
type_spend = (
    valid.groupby(["MEMBER_ID_order","TYPE"])["TOTALPRODUCT_item"]
    .sum()
    .reset_index()
)

type_tot = type_spend.groupby("MEMBER_ID_order")["TOTALPRODUCT_item"].sum().reset_index(name="type_total")
type_spend = type_spend.merge(type_tot, on="MEMBER_ID_order", how="left")
type_spend["type_spend_share"] = type_spend["TOTALPRODUCT_item"] / type_spend["type_total"].replace(0, np.nan)

In [46]:
# top type per customer
top_type = (
    type_spend.sort_values(["MEMBER_ID_order","type_spend_share"], ascending=[True, False])
    .drop_duplicates("MEMBER_ID_order")
    .rename(columns={"TYPE":"top_type","type_spend_share":"top_type_share","MEMBER_ID_order":"MEMBER_ID"})[["MEMBER_ID","top_type","top_type_share"]]
)

customer_features = cust.merge(top_type, on="MEMBER_ID", how="left")
customer_features = customer_features.merge(users, left_on="MEMBER_ID", right_on="USERS_ID", how="left")

# clean up ids
customer_features["MEMBER_ID"] = customer_features["MEMBER_ID"].astype(int)
customer_features.drop(columns=["USERS_ID"], inplace=True)

In [47]:
customer_features.head()

Unnamed: 0,MEMBER_ID,total_orders,total_items,total_units,gross_product_spend,order_adj_total,shipping_total,tax_total,unique_products,unique_types,...,top_type,top_type_share,REGISTERTYPE,LASTORDER,REGISTRATION,LASTSESSION,REGISTRATIONUPDATE,LOGONID,EMAIL,NAME
0,3003,2,2,2,100.0,-10.0,0.0,0.0,1,1,...,ItemBean,1.0,R,2025-11-20,2025-11-20,2025-11-20,2025-11-20,pkumar,pkumar@solveda.com,Pranav Kumar
1,4002,2,2,2,255.0,-27.38,11.88,0.0,2,1,...,ItemBean,1.0,R,2025-11-25,2025-11-21,2025-11-25,2025-11-21,robertdowneyjr1@yopmail.com,robertdowneyjr1@yopmail.com,Robert Downey Jr
2,4003,1,3,3,3342.0,-191.76,86.76,0.0,3,1,...,ItemBean,1.0,R,2025-11-21,2025-11-21,2025-11-21,2025-11-21,johnchambers@yopmail.com,johnchambers@yopmail.com,John Chambers
3,4004,2,12,12,16256.0,-467.96,162.36,0.0,11,1,...,ItemBean,1.0,R,2025-11-21,2025-11-21,2025-11-21,2025-11-21,andymurray@yopmail.com,andymurray@yopmail.com,Andy Murray
4,4005,2,3,3,319.0,-36.7,19.8,0.0,3,1,...,ItemBean,1.0,R,2025-11-21,2025-11-21,2025-11-21,2025-11-21,michaels@yopmail.com,michaels@yopmail.com,Michael S


In [52]:
customer_features.to_csv("data_intermediate/customer_features.csv", index=False)