**1. Imports and Settings**

In [2]:
import pandas as pd
import numpy as np

# Show every column when a wide frame is displayed (None removes the column cap).
pd.options.display.max_columns = None

# Directory holding the raw CSV inputs; file names are appended directly to it.
DATA_PATH = "../data/raw/"


**2. Load Prior Data**

In [3]:
# Lookup tables that describe products.
products = pd.read_csv(DATA_PATH + "products.csv")
aisles = pd.read_csv(DATA_PATH + "aisles.csv")
departments = pd.read_csv(DATA_PATH + "departments.csv")

# Order headers and the order lines of the "prior" evaluation set.
orders = pd.read_csv(DATA_PATH + "orders.csv")
op_prior = pd.read_csv(DATA_PATH + "order_products__prior.csv")

# Keep only the order headers whose eval_set is "prior".
orders_prior = orders.loc[orders["eval_set"].eq("prior")]


In [6]:
# Print the first rows of `orders`, then the distinct eval_set labels, one per line.
print(orders.head(), orders["eval_set"].unique(), sep="\n")


   order_id  user_id eval_set  order_number  order_dow  order_hour_of_day  \
0   2539329        1    prior             1          2                  8   
1   2398795        1    prior             2          3                  7   
2    473747        1    prior             3          3                 12   
3   2254736        1    prior             4          4                  7   
4    431534        1    prior             5          4                 15   

   days_since_prior_order  
0                     NaN  
1                    15.0  
2                    21.0  
3                    29.0  
4                    28.0  
['prior' 'train' 'test']


**3. Merge Prior Orders with Product Metadata**

In [7]:
# Start from the prior order lines and enrich them one lookup at a time.
# Every step is a left merge, so each order line is kept even without a match.
prior = op_prior.merge(orders_prior, how="left", on="order_id")
prior = prior.merge(products, how="left", on="product_id")
prior = prior.merge(aisles, how="left", on="aisle_id")
prior = prior.merge(departments, how="left", on="department_id")

prior.head()


Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_name,aisle_id,department_id,aisle,department
0,2,33120,1,1,202279,prior,3,5,9,8.0,Organic Egg Whites,86,16,eggs,dairy eggs
1,2,28985,2,1,202279,prior,3,5,9,8.0,Michigan Organic Kale,83,4,fresh vegetables,produce
2,2,9327,3,0,202279,prior,3,5,9,8.0,Garlic Powder,104,13,spices seasonings,pantry
3,2,45918,4,1,202279,prior,3,5,9,8.0,Coconut Butter,19,13,oils vinegars,pantry
4,2,30035,5,0,202279,prior,3,5,9,8.0,Natural Sweetener,17,13,baking ingredients,pantry


**4. Build Time Axis (days_since_first_order)**

In [8]:
# Order each user's prior orders by order_number so that the per-user
# cumulative sum computed next runs in purchase sequence.
orders_prior_sorted = orders_prior.sort_values(by=["user_id", "order_number"])

In [9]:
# Running total of the gaps between consecutive orders, per user.
# cumsum leaves NaN wherever days_since_prior_order is NaN (the first order in
# the sample output above); fillna(0) turns those positions into day 0.
orders_prior_sorted = orders_prior_sorted.assign(
    days_since_first_order=lambda frame: (
        frame.groupby("user_id")["days_since_prior_order"].cumsum().fillna(0)
    )
)

In [10]:
# Attach each order's position on the per-user time axis to every order line.
#
# This cell rebinds `prior` from itself, so it must be safe to re-run: drop any
# copy of the column left by an earlier execution first. Without that, a second
# run would merge onto a frame that already has the column and pandas would
# emit `days_since_first_order_x` / `_y`, breaking later lookups of
# "days_since_first_order".
prior = prior.drop(columns=["days_since_first_order"], errors="ignore").merge(
    orders_prior_sorted[["order_id", "days_since_first_order"]],
    on="order_id",
    how="left",
)

**5. Create User-Level Features**

*5.1 Base Aggregates*

In [13]:
# Per-user order statistics, indexed by user_id:
#   n_orders                - highest order_number seen for the user
#   avg_days_between_orders - mean of days_since_prior_order (NaN gaps are skipped)
user_orders = orders_prior.groupby("user_id").agg(
    n_orders=pd.NamedAgg(column="order_number", aggfunc="max"),
    avg_days_between_orders=pd.NamedAgg(
        column="days_since_prior_order", aggfunc="mean"
    ),
)


In [14]:
# Preview the first five users' order statistics.
user_orders.head(n=5)

Unnamed: 0_level_0,n_orders,avg_days_between_orders
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,10,19.555556
2,14,15.230769
3,12,12.090909
4,5,13.75
5,4,13.333333


*5.2 Basket Size*

In [15]:
# Count the product rows in every (user, order) basket, then average those
# counts per user.
basket_sizes = (
    prior.groupby(["user_id", "order_id"])["product_id"]
    .count()
    .groupby(level="user_id")
    .mean()
    .rename("avg_basket_size")
    .to_frame()
)


In [None]:
# Preview the first five users' average basket sizes.
basket_sizes.head(n=5)

*5.3 Reorder Ratio*

In [16]:
# Mean of the `reordered` flag over all of a user's prior order lines.
user_reorder_ratio = prior.groupby("user_id").agg(
    user_reorder_ratio=("reordered", "mean")
)


In [None]:
# Preview the first five users' reorder ratios.
user_reorder_ratio.head(n=5)

*5.4 Combine Features*

In [17]:
# Combine the three user-level tables on their shared user_id index, keeping
# every user present in `user_orders`, then expose user_id as a regular column.
# After this step the frame has a positional index, not a user_id index.
user_features = (
    user_orders
    .join(basket_sizes, how="left")
    .join(user_reorder_ratio, how="left")
    .reset_index()
)



**6. Create Product-Level Features**

In [18]:
# One row per (product_id, product_name) computed from the prior order lines:
#   prod_total_purchases - number of order lines containing the product
#   prod_n_users         - number of distinct users who bought it
#   prod_reorder_rate    - mean of the `reordered` flag
#   prod_avg_add_to_cart - mean add_to_cart_order position
#
# reset_index() is applied exactly once. A second reset would push the
# positional RangeIndex into a meaningless `index` column that then leaks into
# product_features.parquet and into every downstream merge.
product_features = (
    prior.groupby(["product_id", "product_name"])
    .agg(
        prod_total_purchases=("order_id", "count"),
        prod_n_users=("user_id", "nunique"),
        prod_reorder_rate=("reordered", "mean"),
        prod_avg_add_to_cart=("add_to_cart_order", "mean"),
    )
    .reset_index()
)


**7. User–Product Interaction Features**

In [19]:
# User x product interaction statistics, indexed by (user_id, product_id):
# how many order lines the pair has, the first and last order_number in which
# it appears, and the mean add-to-cart position.
up = prior.groupby(by=["user_id", "product_id"]).agg(
    **{
        "up_order_count": ("order_id", "count"),
        "up_first_order_number": ("order_number", "min"),
        "up_last_order_number": ("order_number", "max"),
        "up_avg_add_to_cart": ("add_to_cart_order", "mean"),
    }
)


*7.1 Interaction Order Ratio*

In [20]:
# Bring each user's n_orders onto the user-product rows.
# `user_features` was flattened with reset_index(), so its index is a positional
# RangeIndex, not user_id. join(on="user_id") matches against the *index* of the
# other object, so the Series must be re-keyed by user_id first; otherwise every
# user would receive the n_orders of whichever row sits at position == user_id.
up = up.join(user_features.set_index("user_id")["n_orders"], on="user_id")

# up_order_count relative to the user's n_orders.
up["up_order_ratio"] = up["up_order_count"] / up["n_orders"]


*7.2 Recency Feature Anchored to Last Order*

In [21]:
# Largest days_since_first_order at which each user bought each product,
# i.e. the time-axis position of the pair's most recent purchase.
last_day = prior.groupby(["user_id", "product_id"]).agg(
    up_last_days_since_first_order=("days_since_first_order", "max")
)

# Both frames are indexed by (user_id, product_id), so this aligns on the index.
up = up.join(last_day, how="left")


**8. Save Features**

In [22]:
# Persist every intermediate table with the fastparquet engine.

# User-level and product-level feature tables.
user_features.to_parquet(
    path="../data/features/user_features.parquet", engine="fastparquet"
)
product_features.to_parquet(
    path="../data/features/product_features.parquet", engine="fastparquet"
)

# `up` is indexed by (user_id, product_id); flatten it so both keys are stored
# as ordinary columns.
up.reset_index().to_parquet(
    path="../data/features/user_product_features.parquet", engine="fastparquet"
)

# Fully merged prior order lines.
prior.to_parquet(
    path="../data/features/prior_merged.parquet", engine="fastparquet"
)


In [23]:
# ---------------------------------------------------
# FINAL INFERENCE FEATURE TABLE (TRAIN / SERVE PARITY)
# ---------------------------------------------------

# Join the user-level and product-level tables onto the user-product rows.
# This is meant to mirror the merge used at training time (that code is not
# part of this notebook). Both `up` and `user_features` carry an `n_orders`
# column, so the first merge emits it twice with pandas' default suffixes:
# `n_orders_x` (from `up`) and `n_orders_y` (from `user_features`).
final_features = up.reset_index().merge(user_features, how="left", on="user_id")
final_features = final_features.merge(product_features, how="left", on="product_id")

# Feature columns selected for the inference table, in output order.
model_features = [
    # user-level
    "n_orders_x",
    "avg_days_between_orders",
    "avg_basket_size",
    "user_reorder_ratio",
    # product-level
    "prod_total_purchases",
    "prod_n_users",
    "prod_reorder_rate",
    "prod_avg_add_to_cart",
    # user x product
    "up_order_count",
    "up_first_order_number",
    "up_last_order_number",
    "up_avg_add_to_cart",
    "n_orders_y",
    "up_order_ratio",
    "up_last_days_since_first_order",
]

# Sanity check: fail fast if any expected feature column was not produced.
missing = set(model_features).difference(final_features.columns)
assert not missing, f"Missing features: {missing}"

# Identifier columns followed by the model features, as an independent copy.
inference_features = final_features.loc[
    :, ["user_id", "product_id", "product_name", *model_features]
].copy()

# Persist the inference-ready feature table without writing the index.
inference_features.to_parquet(
    "../data/features/inference_features.parquet", index=False
)

print("✅ inference_features.parquet saved with full model feature set")


✅ inference_features.parquet saved with full model feature set
