Can you predict future shopping baskets to recommend better fresh produce purchases?
Farm To Feed connects farmers, businesses and consumers with odd-looking fruits and vegetables that would otherwise go to waste. By selling otherwise wasted fresh produce, Farm to Feed is reducing food waste, boosting farmer incomes, and making fresh, nutritious produce more accessible to all

Installing the packages

In [57]:
!pip install requests pandas numpy matplotlib seaborn  lightgbm xgboost catboost



In [58]:
!pip install scikit-learn



Import packages and libraries

In [59]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score, mean_absolute_error

import lightgbm as lgb

pd.set_option("display.max_columns", 200)
np.random.seed(42)


Load Data

In [60]:
DATA_PATH = "data/"

train = pd.read_csv(DATA_PATH + "Train.csv", parse_dates=["week_start", "customer_created_at"])
test = pd.read_csv(DATA_PATH + "Test.csv", parse_dates=["week_start", "customer_created_at"])

print(train.shape, test.shape)
train.head()


(2114436, 20) (275796, 11)


Unnamed: 0,ID,customer_id,product_unit_variant_id,week_start,qty_this_week,num_orders_week,spend_this_week,purchased_this_week,product_id,grade_name,unit_name,product_grade_variant_id,selling_price,customer_category,customer_status,customer_created_at,Target_qty_next_1w,Target_purchase_next_1w,Target_qty_next_2w,Target_purchase_next_2w
0,339_1_20241028,339,1,2024-10-28,0.0,0.0,0.0,0,65,GRADE_01,UNIT_004,84,280.0,CUST_CAT_003,CUST_STAT_000,2023-09-29,0.0,0,0.0,0
1,679_498_20241028,679,498,2024-10-28,0.0,0.0,0.0,0,430,GRADE_05,UNIT_004,479,711.0,CUST_CAT_003,CUST_STAT_000,2025-07-10,0.0,0,0.0,0
2,679_502_20241028,679,502,2024-10-28,0.0,0.0,0.0,0,434,GRADE_05,UNIT_004,483,1047.0,CUST_CAT_003,CUST_STAT_000,2025-07-10,0.0,0,0.0,0
3,409_576_20241028,409,576,2024-10-28,0.0,0.0,0.0,0,484,GRADE_05,UNIT_006,539,6581.0,CUST_CAT_003,CUST_STAT_000,2024-08-25,0.0,0,0.0,0
4,778_560_20241028,778,560,2024-10-28,0.0,0.0,0.0,0,341,GRADE_01,UNIT_004,351,55.0,CUST_CAT_003,CUST_STAT_000,2025-10-22,0.0,0,0.0,0


Sort & Sanity Checks (Time-Safe)

In [61]:
num_cols = ["qty_this_week", "num_orders_week", "spend_this_week"]

for col in num_cols:
    if col in train.columns:
        train[col] = train[col].fillna(0)
    if col in test.columns:
        test[col] = test[col].fillna(0)


Base Temporal Features

In [62]:
for df in [train, test]:
    df["weekofyear"] = df["week_start"].dt.isocalendar().week.astype(int)
    df["month"] = df["week_start"].dt.month
    
    # Cyclical encoding
    df["week_sin"] = np.sin(2 * np.pi * df["weekofyear"] / 52)
    df["week_cos"] = np.cos(2 * np.pi * df["weekofyear"] / 52)
    
    df["customer_tenure_weeks"] = (
        (df["week_start"] - df["customer_created_at"]).dt.days // 7
    )


Customer–SKU Historical Features (Core Signal)

In [63]:
LAGS = [1, 2, 4, 8]

for lag in LAGS:
    train[f"qty_lag_{lag}"] = (
        train.groupby(["customer_id", "product_unit_variant_id"])["qty_this_week"]
        .shift(lag)
    )
    train[f"orders_lag_{lag}"] = (
        train.groupby(["customer_id", "product_unit_variant_id"])["num_orders_week"]
        .shift(lag)
    )
    train[f"purchase_lag_{lag}"] = (
        train.groupby(["customer_id", "product_unit_variant_id"])["purchased_this_week"]
        .shift(lag)
    )


Rolling Aggregates

In [64]:
ROLLS = [4, 8, 12]

for w in ROLLS:
    train[f"qty_roll_mean_{w}"] = (
        train.groupby(["customer_id", "product_unit_variant_id"])["qty_this_week"]
        .shift(1)
        .rolling(w)
        .mean()
    )
    
    train[f"purchase_freq_{w}"] = (
        train.groupby(["customer_id", "product_unit_variant_id"])["purchased_this_week"]
        .shift(1)
        .rolling(w)
        .mean()
    )


Recency Features

In [65]:
def weeks_since_last_purchase(x):
    last = -1
    out = []
    for i, v in enumerate(x):
        if v == 1:
            last = i
            out.append(0)
        else:
            out.append(i - last if last != -1 else np.nan)
    return out

train["weeks_since_last_purchase"] = (
    train.groupby(["customer_id", "product_unit_variant_id"])["purchased_this_week"]
    .transform(weeks_since_last_purchase)
)


Category & Product-Level Signals

In [66]:
train["category_qty_roll_8w"] = (
    train.groupby(["customer_id", "product_id"])["qty_this_week"]
    .shift(1)
    .rolling(8)
    .mean()
)

train["global_sku_popularity"] = (
    train.groupby("product_unit_variant_id")["purchased_this_week"]
    .transform("mean")
)


Price & Spend Behavior

In [67]:
train["avg_spend_per_order"] = (
    train["spend_this_week"] / (train["num_orders_week"] + 1e-5)
)

train["price_change"] = (
    train.groupby("product_unit_variant_id")["selling_price"]
    .pct_change()
)


Encode Categorical Variables

In [68]:
cat_cols = [
    "customer_id",
    "product_unit_variant_id",
    "product_id",
    "grade_name",
    "unit_name",
    "customer_category",
    "customer_status"
]

encoders = {}
for col in cat_cols:
    le = LabelEncoder()
    train[col] = le.fit_transform(train[col].astype(str))
    test[col] = le.transform(test[col].astype(str))
    encoders[col] = le


Feature List & Targets

In [69]:
TARGETS = [
    "Target_purchase_next_1w",
    "Target_qty_next_1w",
    "Target_purchase_next_2w",
    "Target_qty_next_2w"
]

FEATURES = [c for c in train.columns if c not in (
    TARGETS + [
        "ID",
        "week_start",
        "customer_created_at",
        "qty_this_week",
        "num_orders_week",
        "spend_this_week",
        "purchased_this_week"
    ]
)]


Time-Based Train / Validation Split

In [70]:
cutoff_date = train["week_start"].quantile(0.8)

tr = train[train["week_start"] <= cutoff_date]
va = train[train["week_start"] > cutoff_date]


Model Definitions
Purchase Probability (AUC)

In [71]:
clf_params = dict(
    objective="binary",
    learning_rate=0.05,
    n_estimators=800,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
)
clf_1w = lgb.LGBMClassifier(**clf_params)
clf_2w = lgb.LGBMClassifier(**clf_params)


Quantity Regression (MAE)

In [72]:
reg_params = dict(
    objective="regression",
    learning_rate=0.05,
    n_estimators=800,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
)
reg_1w = lgb.LGBMRegressor(**reg_params)
reg_2w = lgb.LGBMRegressor(**reg_params)


Train Models

In [73]:
clf_1w.fit(
    tr[FEATURES], tr["Target_purchase_next_1w"],
    eval_set=[(va[FEATURES], va["Target_purchase_next_1w"])],
    eval_metric="auc",
    callbacks=[
        lgb.early_stopping(stopping_rounds=50),
        lgb.log_evaluation(period=0)  # disables verbose output
    ]
)

reg_1w.fit(
    tr[FEATURES], tr["Target_qty_next_1w"],
    eval_set=[(va[FEATURES], va["Target_qty_next_1w"])],
    eval_metric="l1",
    callbacks=[
        lgb.early_stopping(stopping_rounds=50),
        lgb.log_evaluation(period=0)
    ]
)


[LightGBM] [Info] Number of positive: 31573, number of negative: 1669169
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.171541 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3180
[LightGBM] [Info] Number of data points in the train set: 1700742, number of used features: 37
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.018564 -> initscore=-3.967779
[LightGBM] [Info] Start training from score -3.967779
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[121]	valid_0's auc: 0.975092	valid_0's binary_logloss: 0.0475564
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.152320 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3180
[LightGBM] [Info

0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,6
,learning_rate,0.05
,n_estimators,800
,subsample_for_bin,200000
,objective,'regression'
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


Validation Metrics

In [74]:
p1 = clf_1w.predict_proba(va[FEATURES])[:, 1]
q1 = reg_1w.predict(va[FEATURES])

print("1W AUC:", roc_auc_score(va["Target_purchase_next_1w"], p1))
print("1W MAE:", mean_absolute_error(va["Target_qty_next_1w"], q1))


1W AUC: 0.9750918077591316
1W MAE: 1.9679054041592772


Test Predictions & Submission

In [75]:
train["is_train"] = 1
test["is_train"] = 0

full = pd.concat([train, test], axis=0, ignore_index=True)

full = full.sort_values(
    ["customer_id", "product_unit_variant_id", "week_start"]
).reset_index(drop=True)

# Lag features
for lag in [1, 2, 4, 8]:
    full[f"qty_lag_{lag}"] = (
        full.groupby(["customer_id", "product_unit_variant_id"])["qty_this_week"]
        .shift(lag)
    )

# Rolling features
for w in [4, 8, 12]:
    full[f"qty_roll_mean_{w}"] = (
        full.groupby(["customer_id", "product_unit_variant_id"])["qty_this_week"]
        .shift(1)
        .rolling(w)
        .mean()
    )

# Recency (safe transform version)
full["weeks_since_last_purchase"] = (
    full.groupby(["customer_id", "product_unit_variant_id"])["purchased_this_week"]
    .transform(weeks_since_last_purchase)
)

train = full[full["is_train"] == 1].drop(columns=["is_train"])
test = full[full["is_train"] == 0].drop(columns=["is_train"])

test["p1"] = clf_1w.predict_proba(
    test[FEATURES], num_iteration=clf_1w.best_iteration_
)[:, 1]

test["q1"] = reg_1w.predict(
    test[FEATURES], num_iteration=reg_1w.best_iteration_
)


In [76]:
clf_2w.fit(
    tr[FEATURES], tr["Target_purchase_next_2w"],
    eval_set=[(va[FEATURES], va["Target_purchase_next_2w"])],
    eval_metric="auc",
    callbacks=[
        lgb.early_stopping(stopping_rounds=50),
        lgb.log_evaluation(period=0)
    ]
)

reg_2w.fit(
    tr[FEATURES], tr["Target_qty_next_2w"],
    eval_set=[(va[FEATURES], va["Target_qty_next_2w"])],
    eval_metric="l1",
    callbacks=[
        lgb.early_stopping(stopping_rounds=50),
        lgb.log_evaluation(period=0)
    ]
)


[LightGBM] [Info] Number of positive: 42293, number of negative: 1658449
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.147272 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3180
[LightGBM] [Info] Number of data points in the train set: 1700742, number of used features: 37
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.024867 -> initscore=-3.669017
[LightGBM] [Info] Start training from score -3.669017
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[174]	valid_0's auc: 0.968586	valid_0's binary_logloss: 0.0651678
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.155823 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3180
[LightGBM] [Info

0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,6
,learning_rate,0.05
,n_estimators,800
,subsample_for_bin,200000
,objective,'regression'
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [77]:
# Predict with best_iteration_ to respect early stopping
p1 = clf_1w.predict_proba(
    test[FEATURES], num_iteration=clf_1w.best_iteration_
)[:, 1]

q1 = reg_1w.predict(
    test[FEATURES], num_iteration=reg_1w.best_iteration_
)

p2 = clf_2w.predict_proba(
    test[FEATURES], num_iteration=clf_2w.best_iteration_
)[:, 1]

q2 = reg_2w.predict(
    test[FEATURES], num_iteration=reg_2w.best_iteration_
)

# Safety: quantities must be non-negative
q1 = np.clip(q1, 0, None)
q2 = np.clip(q2, 0, None)

submission = pd.DataFrame({
    "ID": test["ID"],
    "Target_purchase_next_1w": p1,
    "Target_qty_next_1w": p1 * q1,
    "Target_purchase_next_2w": p2,
    "Target_qty_next_2w": p2 * q2,
})

submission.to_csv("submission.csv", index=False)
submission.head()


Unnamed: 0,ID,Target_purchase_next_1w,Target_qty_next_1w,Target_purchase_next_2w,Target_qty_next_2w
46,339_1_20250922,0.000271,0.000173,0.000657,0.00092
47,339_1_20250929,0.000272,0.000173,0.000662,0.000926
48,339_1_20251006,0.000273,0.000173,0.000689,0.000964
49,339_1_20251013,0.000273,0.000173,0.000689,0.000964
50,339_1_20251020,0.000273,0.000173,0.00078,0.001091
