In [8]:
import pandas as pd

In [3]:
# Read the master dataset from disk; per the original note the file is
# expected to be already cleaned and encoded, so no parsing options are set.
master_csv_path = "master_orders_customers_stores.csv"
df_master = pd.read_csv(master_csv_path)

In [9]:
# ---------------------------------
# 0️⃣ Work out which columns are item columns
# ---------------------------------
# Columns that describe the order, customer or store rather than an item.
metadata_cols = [
    "CUSTOMER_ID", "STORE_NUMBER", "ORDER_CREATED_DATE", "ORDER_ID",
    "ORDER_CHANNEL_NAME", "ORDER_SUBCHANNEL_NAME", "ORDER_OCCASION_NAME",
    "total_order_price", "cust_registered", "cust_guest", "cust_special_membership",
    "store_STATE",
]

# Every column prefixed "store_city_" is excluded as well.
store_city_cols = [name for name in df_master.columns if name.startswith("store_city_")]
exclude_cols = metadata_cols + store_city_cols

# Whatever is left over is treated as an item column.
item_cols = [name for name in df_master.columns if name not in exclude_cols]

# Force the item columns to numeric: values that cannot be parsed become NaN
# (errors="coerce") and are then replaced with 0. df_master is updated in place.
numeric_item_values = df_master[item_cols].apply(pd.to_numeric, errors="coerce")
df_master[item_cols] = numeric_item_values.fillna(0)

In [10]:
# ===============================
# 1️⃣ Number of distinct orders per customer
# ===============================
# nunique() counts each ORDER_ID once per customer, so repeated rows for the
# same order do not inflate the count.
distinct_orders_by_customer = df_master.groupby("CUSTOMER_ID")["ORDER_ID"].nunique()

customer_orders_count = distinct_orders_by_customer.rename("orders_count").reset_index()

In [11]:
# ===============================
# 2️⃣ Total items purchased per customer
# ===============================
# First total every item column per customer, then add those totals across
# the item columns to get one number per customer.
per_item_totals = df_master.groupby("CUSTOMER_ID")[item_cols].sum()

customer_items_count = (
    per_item_totals.sum(axis=1)
    .rename("items_count")
    .reset_index()
)

In [12]:
# ===============================
# 3️⃣ Favorite item (most purchased)
# ===============================
# Per-customer total for every item column.
item_totals_by_customer = df_master.groupby("CUSTOMER_ID")[item_cols].sum()

favorite_item = (
    item_totals_by_customer
    # Label of the item column with the largest total; ties resolve to the
    # first such column in item_cols order.
    .idxmax(axis=1)
    # idxmax still returns the first column label when every total is 0,
    # which would report a "favorite" the customer never bought. Keep the
    # label only where at least one item was actually purchased; otherwise
    # leave the value missing.
    .where(item_totals_by_customer.max(axis=1) > 0)
    .rename("favorite_item")
    .reset_index()
)

In [13]:
# ===============================
# 4️⃣ Repeat purchase rate
#     share of a customer's distinct items whose total quantity exceeds 1
# ===============================
customer_item_freq = df_master.groupby("CUSTOMER_ID")[item_cols].sum()

# Numerator: how many items have a per-customer total greater than 1.
repeated_item_count = customer_item_freq.gt(1).sum(axis=1)
# Denominator: how many items have a per-customer total greater than 0.
distinct_item_count = customer_item_freq.gt(0).sum(axis=1)

# A customer with no purchased items gives 0/0, which pandas reports as NaN.
repeat_purchase_rate = (
    repeated_item_count.div(distinct_item_count)
    .rename("repeat_purchase_rate")
    .reset_index()
)

In [14]:
# ===============================
# 5️⃣ Average order value per customer
# ===============================
# Mean of total_order_price over the customer's rows.
# NOTE(review): this is a per-row mean; it equals a per-order mean only if
# df_master holds one row per order — confirm against the master file.
mean_price_by_customer = df_master.groupby("CUSTOMER_ID")["total_order_price"].mean()

avg_order_value = mean_price_by_customer.rename("avg_order_value").reset_index()

In [15]:
# ===============================
# 6️⃣ Combine the per-customer features into one table
# ===============================
# Start from the order counts and left-join each remaining feature frame on
# CUSTOMER_ID, in the same order as the columns should appear.
df_customer_features = customer_orders_count
for feature_frame in (
    customer_items_count,
    favorite_item,
    repeat_purchase_rate,
    avg_order_value,
):
    df_customer_features = df_customer_features.merge(
        feature_frame, on="CUSTOMER_ID", how="left"
    )

In [16]:

# ===============================
# 7️⃣ Write the feature table to disk
# ===============================
# index=False keeps the positional row index out of the CSV.
features_csv_path = "customer_level_features.csv"
df_customer_features.to_csv(features_csv_path, index=False)

print("✅ Customer-level features file created: customer_level_features.csv")

✅ Customer-level features file created: customer_level_features.csv


In [17]:
# Preview the first ten rows of the assembled customer feature table.
df_customer_features.head(n=10)

Unnamed: 0,CUSTOMER_ID,orders_count,items_count,favorite_item,repeat_purchase_rate,avg_order_value
0,1588,4,6,20pc Spicy Feast Deal,0.5,17.46
1,2475,2,7,20pc Spicy Feast Deal,0.2,57.255
2,4769,1,2,40 pc Family Mixed Wings,0.0,57.08
3,5791,1,1,25 pc Game Day Pack,0.0,24.99
4,6524,3,6,10 pc Grilled Wings,0.0,37.346667
5,7771,7,21,10 pc Grilled Wings Combo,0.5,58.392857
6,11823,1,4,6 pc Grilled Wings Combo,1.0,93.12
7,14343,16,30,Chicken Sub,0.416667,20.719375
8,15966,4,5,24 pc Family Grilled Wings,0.5,35.2625
9,17591,1,2,24 pc Family Grilled Wings,0.0,39.68
