## Step 0 — Install (once) & imports

In [1]:
# If not installed already (safe to rerun)
# %pip install lightgbm==4.3.0 joblib==1.4.2

import os, json, math, gc, random
from datetime import datetime
import numpy as np
import pandas as pd
from joblib import Parallel, delayed

### Step 1 — Config and knobs 

In [10]:
TOP_N_PER_ANCHOR = 40
FALLBACK_MIN_CANDS = 30
NEG_PER_CART = 20
MAX_TRAIN_BASKETS = 80_000

In [None]:
# === FILENAMES (same folder) ===
FILE_MODELING = "modeling_dataset.csv"  # your full ~1.4M modeling dataset (if named different, change)
FILE_TEST     = "test_data_question.csv"
FILE_P        = "P_j_given_i.csv"
FILE_LIFT     = "item_lift_matrix.csv"
FILE_JACC     = "item_jaccard_matrix.csv"
FILE_COOC     = "item_cooccurrence_counts.csv"
FILE_POP      = "item_stats_counts_and_freq.csv"

# === COLUMN MAPPINGS known from your data ===
POPULAR_COL_ITEM = "Unnamed: 0"  # item name column in item_stats_counts_and_freq.csv
POPULAR_COL_FREQ = "freq"

# P_j_given_i: we will auto-melt wide to long if needed
COL_ANCHOR = "anchor_item"
COL_CAND   = "candidate_item"
COL_PROB   = "P_j_given_i"

# === Stage-1 candidate gen knobs used for both training & inference ===
TOP_N_PER_ANCHOR = 60         # how many candidates per anchor item (before aggregation)
FALLBACK_MIN_CANDS = 40       # ensure at least this many candidates per cart
AGG_PROB_METHOD = "one_minus_prod"  # {"one_minus_prod","sum","max"}
AGG_OTHER_METHOD = "mean"           # {"mean","sum","max"}

W_PROB = 0.60
W_LIFT = 0.25
W_JACC = 0.10
W_POPU = 0.05

# === Training set size (speed/scale) ===
MAX_TRAIN_BASKETS = 150_000   # cap number of LOO carts from modeling (adjust per machine)
NEG_PER_CART      = 40        # sampled negatives per cart (downsample to keep train small)
RANDOM_SEED       = 42
N_JOBS            = max(1, os.cpu_count() - 1)  # parallel jobs for feature gen

# === Validation split ===
# Use time-based split (last X% of orders for validation)
VAL_FRACTION_TIME = 0.10  # use last 10% by date as validation

# === Output files ===
FILE_STAGE2_MODEL = "stage2_lgbm_model.txt"              # saved LightGBM model
FILE_STAGE2_FEATS = "stage2_feature_columns.json"        # persisted feature list
FILE_STAGE2_SUB   = "stage2_recommendations_top3.csv"    # final submission csv


### Step 2 — Helpers

In [3]:
def canonicalize_item(x):
    if pd.isna(x): return x
    return str(x).strip()

def safe_minmax(s: pd.Series):
    s = s.astype(float).fillna(0.0)
    lo, hi = s.min(), s.max()
    if hi - lo < 1e-12:
        return pd.Series(np.zeros(len(s), dtype=float), index=s.index)
    return (s - lo) / (hi - lo)

def one_minus_product_of_complements(values):
    v = np.clip(np.array(values, dtype=float), 0.0, 1.0)
    return float(1.0 - np.prod(1.0 - v))

def item_bucket(name: str) -> str:
    n = str(name).lower()
    if "fries" in n: return "fries"
    if "combo" in n: return "combo"
    if "dip" in n: return "dip"
    if "corn" in n: return "sides"
    if any(w in n for w in ["cake","brownie","cookie"]): return "dessert"
    if "drink" in n or "soda" in n: return "drink"
    if "wings" in n and "spicy" in n: return "wings_spicy"
    if "wings" in n and "grilled" in n: return "wings_grilled"
    if "wings" in n: return "wings"
    if "strips" in n: return "strips"
    return "other"


### Step 3 — Load tables and normalize them

In [8]:
# 3.1 Popularity
pop_df = pd.read_csv(FILE_POP)
pop_df = pop_df[[POPULAR_COL_ITEM, POPULAR_COL_FREQ]].dropna()
pop_df[POPULAR_COL_ITEM] = pop_df[POPULAR_COL_ITEM].map(canonicalize_item)
pop_df = pop_df.sort_values(POPULAR_COL_FREQ, ascending=False).reset_index(drop=True)

# 3.2 P(j|i) table
P_raw = pd.read_csv(FILE_P)
# If it's a wide N x N matrix, melt it:
if COL_PROB not in P_raw.columns or len({'item','Item','ITEM'}.intersection(P_raw.columns))>0:
    row_item_col = 'item' if 'item' in P_raw.columns else ('Item' if 'Item' in P_raw.columns else P_raw.columns[0])
    P = P_raw.melt(id_vars=[row_item_col], var_name=COL_CAND, value_name=COL_PROB)\
             .rename(columns={row_item_col: COL_ANCHOR})
else:
    P = P_raw.rename(columns={P_raw.columns[0]: COL_ANCHOR,
                              P_raw.columns[1]: COL_CAND,
                              P_raw.columns[2]: COL_PROB})
P[COL_ANCHOR] = P[COL_ANCHOR].map(canonicalize_item)
P[COL_CAND]   = P[COL_CAND].map(canonicalize_item)
P = P.dropna(subset=[COL_CAND, COL_PROB])

# 3.3 Lift / Jaccard / Co-occurrence: melt to long if needed
def to_long_pairs(df, value_col):
    cols = df.columns.tolist()
    str_cols = [c for c in cols if df[c].dtype == "object"]
    if value_col in cols and len(str_cols) >= 2:
        # already long
        return df.rename(columns={str_cols[0]: COL_ANCHOR, str_cols[1]: COL_CAND})
    base = df.copy()
    row_item_col = cols[0]
    base[row_item_col] = base[row_item_col].map(canonicalize_item)
    long_df = base.melt(id_vars=[row_item_col], var_name=COL_CAND, value_name=value_col)\
                  .rename(columns={row_item_col: COL_ANCHOR})
    long_df[COL_CAND] = long_df[COL_CAND].map(canonicalize_item)
    return long_df

lift_pairs = to_long_pairs(pd.read_csv(FILE_LIFT),  "lift")
jacc_pairs = to_long_pairs(pd.read_csv(FILE_JACC),  "jaccard")
cooc_pairs = to_long_pairs(pd.read_csv(FILE_COOC),  "cooc_count")

# Keep only needed columns & drop NaNs
lift_pairs = lift_pairs[[COL_ANCHOR, COL_CAND, "lift"]].dropna()
jacc_pairs = jacc_pairs[[COL_ANCHOR, COL_CAND, "jaccard"]].dropna()
cooc_pairs = cooc_pairs[[COL_ANCHOR, COL_CAND, "cooc_count"]].dropna()

# 3.4 Build fast per-anchor top-N
P_sorted = P.sort_values([COL_ANCHOR, COL_PROB], ascending=[True, False]).copy()
P_topN = P_sorted.groupby(COL_ANCHOR, as_index=False).head(TOP_N_PER_ANCHOR)
P_by_anchor = {a: g[[COL_CAND, COL_PROB]].reset_index(drop=True)
               for a, g in P_topN.groupby(COL_ANCHOR)}

POPULAR_ITEMS = pop_df[POPULAR_COL_ITEM].tolist()

# === Step 3.5 — Prejoin signals once per anchor to avoid merging inside every call ===
pairs_all = P_topN.merge(lift_pairs, on=[COL_ANCHOR, COL_CAND], how="left") \
                  .merge(jacc_pairs, on=[COL_ANCHOR, COL_CAND], how="left") \
                  .merge(cooc_pairs, on=[COL_ANCHOR, COL_CAND], how="left")

# Build a dict: anchor -> DataFrame of [candidate, prob, lift, jaccard, cooc_count]
CAND_BY_ANCHOR = {a: g[[COL_CAND, COL_PROB, "lift", "jaccard", "cooc_count"]].reset_index(drop=True)
                  for a, g in pairs_all.groupby(COL_ANCHOR)}



### Step 4 — Detect menu-item columns in the modeling dataset

In [5]:
# Only load header to detect columns fast
modeling_header = pd.read_csv(FILE_MODELING, nrows=5)
menu_items = set(pop_df[POPULAR_COL_ITEM].unique())
ITEM_COLS = [c for c in modeling_header.columns if c in menu_items]

ID_COLS = [
    "CUSTOMER_ID","STORE_NUMBER","ORDER_CREATED_DATE","ORDER_ID",
    "ORDER_CHANNEL_NAME","ORDER_SUBCHANNEL_NAME","ORDER_OCCASION_NAME"
]

CUST_CTX_COLS = [
    # customer-type one-hots (based on your schema)
    "cust_registered","cust_guest","cust_special_membership",
    # customer features
    "orders_count","items_count","repeat_purchase_rate","avg_order_value",
    # preferred/favorite
    "favorite_item",
    # contextual features baked in
    "weekend_order_ratio","most_common_order_hour","most_common_order_dow",
    "most_common_store","store_diversity_count"
]

STORE_ONEHOTS = [c for c in modeling_header.columns if c.startswith("store_city_")] + (["store_STATE"] if "store_STATE" in modeling_header.columns else [])

# Keep only columns we’ll actually use to avoid memory blow-ups
KEEP_COLS = ID_COLS + ITEM_COLS + CUST_CTX_COLS + STORE_ONEHOTS + (["total_order_price"] if "total_order_price" in modeling_header.columns else [])


### Step 5 — Stage-1 candidate generator (vectorized, same logic as earlier)

In [9]:
def get_candidates_for_cart(cart_items):
    # 1) Collect per-anchor frames fast (no merges here)
    frames = []
    for anchor in cart_items:
        df = CAND_BY_ANCHOR.get(anchor)
        if df is not None and not df.empty:
            tmp = df.copy()
            tmp["_anchor"] = anchor
            frames.append(tmp)

    if not frames:
        # popularity fallback
        fb = pop_df[~pop_df[POPULAR_COL_ITEM].isin(cart_items)] \
             .rename(columns={POPULAR_COL_ITEM: COL_CAND}).copy()
        fb["stage1_score"] = safe_minmax(fb[POPULAR_COL_FREQ])
        fb["votes"] = 0
        return fb[[COL_CAND, "stage1_score", "votes"]].head(FALLBACK_MIN_CANDS)

    cand_pairs = pd.concat(frames, ignore_index=True)
    cand_pairs = cand_pairs[~cand_pairs[COL_CAND].isin(cart_items)].copy()

    # 2) Aggregate across anchors
    def agg_prob(s):
        if AGG_PROB_METHOD == "one_minus_prod": return one_minus_product_of_complements(s)
        if AGG_PROB_METHOD == "sum": return s.sum()
        return s.max()
    def agg_other(s):
        if AGG_OTHER_METHOD == "mean": return s.mean(skipna=True)
        if AGG_OTHER_METHOD == "sum": return s.sum(skipna=True)
        return s.max(skipna=True)

    grouped = cand_pairs.groupby(COL_CAND).agg({
        COL_PROB: agg_prob,
        "lift": agg_other,
        "jaccard": agg_other,
        "cooc_count": "sum",
        "_anchor": "count"     # <-- votes (how many anchors suggested this candidate)
    }).rename(columns={"_anchor": "votes"}).reset_index()

    # 3) Attach popularity & score
    grouped = grouped.merge(
        pop_df.rename(columns={POPULAR_COL_ITEM: COL_CAND, POPULAR_COL_FREQ: "popularity"}),
        on=COL_CAND, how="left"
    )

    for col in [COL_PROB, "lift", "jaccard", "popularity"]:
        if col not in grouped.columns: grouped[col] = 0.0
        grouped[f"{col}_norm"] = safe_minmax(grouped[col])

    grouped["stage1_score"] = (
        W_PROB*grouped[f"{COL_PROB}_norm"] +
        W_LIFT*grouped["lift_norm"] +
        W_JACC*grouped["jaccard_norm"] +
        W_POPU*grouped["popularity_norm"]
    )

    # 4) Ensure minimum candidates
    need = max(0, FALLBACK_MIN_CANDS - len(grouped))
    if need > 0:
        fb = pop_df[~pop_df[POPULAR_COL_ITEM].isin(set(grouped[COL_CAND]).union(cart_items))] \
                  .head(need).rename(columns={POPULAR_COL_ITEM: COL_CAND})
        fb["stage1_score"] = 0.0
        fb["votes"] = 0
        grouped = pd.concat([grouped[[COL_CAND,"stage1_score","votes",COL_PROB,"lift","jaccard","popularity"]],
                             fb[[COL_CAND,"stage1_score","votes"]]],
                            ignore_index=True)

    return grouped.sort_values("stage1_score", ascending=False).reset_index(drop=True)


### Step 6 — Create Leave-One-Out (LOO) training pairs

In [13]:
rng = np.random.default_rng(RANDOM_SEED)

def row_to_cart_items(row_items):
    # return list of item cols present (value > 0)
    return [c for c, v in row_items.items() if float(v) > 0]

def build_pairs_for_row(row):
    """
    Returns a DataFrame with columns:
    - 'cart_id', 'candidate_item', 'label', 'ORDER_CREATED_DATE' (+ features in next step)
    cart_id is ORDER_ID to keep a unique group per cart in training.
    """
    # basket items from one-hot/counts
    items = [c for c in ITEM_COLS if float(row.get(c, 0)) > 0]
    if len(items) < 2:
        return None

    heldout = rng.choice(items)
    cart = [it for it in items if it != heldout]

    # Stage-1 candidates (vectorized aggregation)
    ranked = get_candidates_for_cart(cart)

    # --- NEW: canonicalize both sides to avoid string mismatches ---
    heldout = canonicalize_item(heldout)
    ranked[COL_CAND] = ranked[COL_CAND].map(canonicalize_item)

    # Ensure the true item is present (pre-truncation)
    if heldout not in set(ranked[COL_CAND]):
        ranked = pd.concat(
            [pd.DataFrame({COL_CAND:[heldout],
                        "stage1_score":[(ranked["stage1_score"].min() if len(ranked) else 0) - 1e-6],
                        COL_PROB:[0.0], "lift":[0.0], "jaccard":[0.0], "popularity":[0.0]}),
            ranked],
            ignore_index=True
        )


    # Sample negatives to keep per-cart small
    target = max(NEG_PER_CART+1, 50)
    ranked = ranked.head(target)

    # --- NEW: ensure the positive survived truncation ---
    if not (ranked[COL_CAND] == heldout).any():
        extra = pd.DataFrame({COL_CAND:[heldout],
                            "stage1_score":[ranked["stage1_score"].min() - 1e-6],
                            COL_PROB:[0.0], "lift":[0.0], "jaccard":[0.0], "popularity":[0.0]})
        # put it at the top and re-trim to keep length == target
        ranked = pd.concat([extra, ranked], ignore_index=True).head(target)

    # Labels
    ranked["label"] = (ranked[COL_CAND] == heldout).astype(int)

    out = ranked.copy()
    out["cart_id"] = row["ORDER_ID"]  # group id
    out["ORDER_CREATED_DATE"] = row.get("ORDER_CREATED_DATE")
    # Keep some cart-level descriptors (optional for features)
    out["cart_size"] = len(cart)
    out["anchors_voted"] = out.get("votes", 0)
    out["cart_has_combo"] = int(any("combo" in it.lower() for it in cart))
    out["cart_has_wings"] = int(any("wings" in it.lower() for it in cart))
    out["cart_has_fries"] = int(any("fries" in it.lower() for it in cart))
    out["cart_has_dip"]   = int(any("dip"   in it.lower() for it in cart))
    out["candidate_bucket"] = out[COL_CAND].map(item_bucket)
    return out

def iterate_modeling_rows():
    # chunked read for memory efficiency
    usecols = [c for c in KEEP_COLS if c in modeling_header.columns]
    for chunk in pd.read_csv(FILE_MODELING, usecols=usecols, chunksize=50_000):
        # ensure proper dtypes
        for c in ITEM_COLS:
            if c in chunk.columns:
                chunk[c] = chunk[c].fillna(0).astype(float)
        yield chunk

# Build train pool (sample up to MAX_TRAIN_BASKETS rows that have >=2 items)
train_pairs = []
seen = 0

for chunk in iterate_modeling_rows():
    # Keep baskets with >=2 items
    basket_mask = chunk[ITEM_COLS].sum(axis=1) >= 2
    cand_chunk = chunk.loc[basket_mask].copy()

    # --- NEW: ensure one LOO per ORDER_ID in this pass ---
    cand_chunk = cand_chunk.drop_duplicates(subset=["ORDER_ID"])

    # time split prep
    # parse date once
    cand_chunk["ORDER_CREATED_DATE"] = pd.to_datetime(cand_chunk["ORDER_CREATED_DATE"])

    # sample subset to control size
    if len(cand_chunk) > 0:
        needed = max(0, MAX_TRAIN_BASKETS - seen)
        if needed <= 0:
            break
        frac = min(1.0, needed / len(cand_chunk))
        sampled = cand_chunk.sample(frac=frac, random_state=RANDOM_SEED)
        seen += len(sampled)

        # Parallel per-row pair building
        dict_rows = sampled.to_dict(orient="records")
        # results = Parallel(n_jobs=N_JOBS, backend="loky", verbose=0)(
        #     delayed(build_pairs_for_row)(row) for row in dict_rows
        # )
        results = Parallel(
            n_jobs=N_JOBS,
            backend="threading",        # <-- was "loky"
            batch_size=256,             # <-- add batching to reduce scheduling overhead
            verbose=0
        )(delayed(build_pairs_for_row)(row) for row in dict_rows)
        
        results = [r for r in results if r is not None]
        if results:
            train_pairs.append(pd.concat(results, ignore_index=True))

    # free memory
    del cand_chunk, chunk
    gc.collect()

train_df = pd.concat(train_pairs, ignore_index=True) if train_pairs else pd.DataFrame()
print("Train pairs:", train_df.shape)


Train pairs: (3637100, 22)


### Checks

In [14]:
n_rows = len(train_df)
n_carts = train_df['cart_id'].nunique()
avg_per_cart = n_rows / max(1, n_carts)
pos_rate = train_df['label'].mean()
cart_without_pos = (train_df.groupby('cart_id')['label'].max() == 0).sum()
cart_size_stats = train_df.groupby('cart_id').size().describe()

print(f"rows={n_rows:,}  carts={n_carts:,}  avg_per_cart={avg_per_cart:.1f}")
print(f"positive rate (should be ~1/(NEG_PER_CART+1)): {pos_rate:.4f}")
print(f"carts with NO positive (should be 0): {cart_without_pos}")
print("cart size stats:\n", cart_size_stats)

rows=3,637,100  carts=72,742  avg_per_cart=50.0
positive rate (should be ~1/(NEG_PER_CART+1)): 0.0200
carts with NO positive (should be 0): 0
cart size stats:
 count    72742.0
mean        50.0
std          0.0
min         50.0
25%         50.0
50%         50.0
75%         50.0
max         50.0
dtype: float64


### Step 7 — Feature engineering for pairs (cart, candidate)

In [15]:
# Candidate attributes
train_df = train_df.merge(pop_df.rename(columns={POPULAR_COL_ITEM: COL_CAND,
                                                 POPULAR_COL_FREQ: "cand_popularity"}),
                          on=COL_CAND, how="left")

# Stage-1 ranks
train_df["stage1_rank"] = train_df.groupby("cart_id")["stage1_score"].rank(ascending=False, method="first")

# Candidate type/bucket one-hot (small set)
train_df["cand_is_combo"]   = (train_df["candidate_bucket"] == "combo").astype(int)
train_df["cand_is_fries"]   = (train_df["candidate_bucket"] == "fries").astype(int)
train_df["cand_is_dip"]     = (train_df["candidate_bucket"] == "dip").astype(int)
train_df["cand_is_wings"]   = train_df["candidate_bucket"].isin(["wings","wings_spicy","wings_grilled"]).astype(int)
train_df["cand_is_strips"]  = (train_df["candidate_bucket"] == "strips").astype(int)

# Bring customer/context columns from modeling set (for each cart_id)
# We left-join one row per cart_id from original modeling dataset
# (cart_id is ORDER_ID of that row)
ctx_cols = [c for c in (ID_COLS + CUST_CTX_COLS + STORE_ONEHOTS + ["total_order_price"]) if c in modeling_header.columns]
ctx = []

for chunk in pd.read_csv(FILE_MODELING, usecols=ctx_cols, chunksize=100_000):
    ctx.append(chunk.drop_duplicates(subset=["ORDER_ID"]))
ctx = pd.concat(ctx, ignore_index=True).drop_duplicates(subset=["ORDER_ID"])
ctx["ORDER_CREATED_DATE"] = pd.to_datetime(ctx["ORDER_CREATED_DATE"])

train_df = train_df.merge(ctx.rename(columns={"ORDER_ID":"cart_id"}), on="cart_id", how="left")

# Binary feature: candidate equals favorite item
if "favorite_item" in train_df.columns:
    train_df["cand_is_favorite_item"] = (train_df[COL_CAND] == train_df["favorite_item"].astype(str)).astype(int)
else:
    train_df["cand_is_favorite_item"] = 0

# ----- Consistent category vocab (train) -----
CAT_COLS = [c for c in ["ORDER_CHANNEL_NAME","ORDER_SUBCHANNEL_NAME","ORDER_OCCASION_NAME"] if c in train_df.columns]
cat_vocab = {}
for c in CAT_COLS:
    train_df[c] = train_df[c].astype(str)
    cats = sorted(train_df[c].unique())
    cat_vocab[c] = cats
    map_c = {s:i for i,s in enumerate(cats)}
    train_df[c] = train_df[c].map(map_c).fillna(-1).astype(int)

with open("stage2_cat_vocab.json","w") as f:
    json.dump(cat_vocab, f)

# ----- Downcast numerics to float32 (saves RAM) -----
for col in train_df.columns:
    if col not in ('cart_id','label','ORDER_CREATED_DATE', COL_CAND):
        if pd.api.types.is_numeric_dtype(train_df[col]):
            train_df[col] = pd.to_numeric(train_df[col], downcast='float')

# Final numeric cleanups
for col in ["cand_popularity","jaccard","lift",COL_PROB,"stage1_score","stage1_rank",
            "orders_count","items_count","repeat_purchase_rate","avg_order_value",
            "weekend_order_ratio","store_diversity_count","cart_size","anchors_voted"]:
    if col in train_df.columns:
        train_df[col] = train_df[col].fillna(0.0).astype(float)

# Drop heavy strings not needed after feature derivation
drop_cols = ["candidate_bucket","favorite_item","most_common_store"]
for c in drop_cols:
    if c in train_df.columns: train_df.drop(columns=[c], inplace=True)
gc.collect()

0

In [19]:
train_df.columns

Index(['candidate_item', 'P_j_given_i', 'lift', 'jaccard', 'cooc_count',
       'votes', 'popularity', 'P_j_given_i_norm', 'lift_norm', 'jaccard_norm',
       'popularity_norm', 'stage1_score', 'label', 'cart_id',
       'ORDER_CREATED_DATE_x', 'cart_size', 'anchors_voted', 'cart_has_combo',
       'cart_has_wings', 'cart_has_fries', 'cart_has_dip', 'cand_popularity',
       'stage1_rank', 'cand_is_combo', 'cand_is_fries', 'cand_is_dip',
       'cand_is_wings', 'cand_is_strips', 'CUSTOMER_ID', 'STORE_NUMBER',
       'ORDER_CREATED_DATE_y', 'ORDER_CHANNEL_NAME', 'ORDER_SUBCHANNEL_NAME',
       'ORDER_OCCASION_NAME', 'total_order_price', 'cust_registered',
       'cust_guest', 'cust_special_membership', 'store_STATE',
       'store_city_Apple Valley', 'store_city_Ardmore', 'store_city_Arlington',
       'store_city_Atwater', 'store_city_Aurora', 'store_city_Austin',
       'store_city_Brandon', 'store_city_Charlotte', 'store_city_Cicero',
       'store_city_Dallas', 'store_city_El Paso',

In [20]:
# One-off coalesce for existing train_df
if "ORDER_CREATED_DATE" not in train_df.columns:
    if "ORDER_CREATED_DATE_x" in train_df.columns or "ORDER_CREATED_DATE_y" in train_df.columns:
        left = train_df.get("ORDER_CREATED_DATE_x")
        right = train_df.get("ORDER_CREATED_DATE_y")
        train_df["ORDER_CREATED_DATE"] = (left if left is not None else pd.Series(index=train_df.index)).fillna(right)
for c in ["ORDER_CREATED_DATE_ctx", "ORDER_CREATED_DATE_x", "ORDER_CREATED_DATE_y"]:
    if c in train_df.columns:
        train_df.drop(columns=[c], inplace=True)
train_df["ORDER_CREATED_DATE"] = pd.to_datetime(train_df["ORDER_CREATED_DATE"])


In [23]:
# 1) groups sum must equal dataset length
assert sum(train_group) == len(train_sorted), "Train groups don't sum to num rows."
assert sum(val_group) == len(val_sorted),     "Val groups don't sum to num rows."

# 2) features should be numeric (convert any stray objects just in case)
bad = [c for c in FEATURES if not pd.api.types.is_numeric_dtype(train_sorted[c])]
if bad:
    for c in bad:
        train_sorted[c] = pd.to_numeric(train_sorted[c], errors="coerce").fillna(0)
        val_sorted[c]   = pd.to_numeric(val_sorted[c], errors="coerce").fillna(0)


### Step 8 — Train/val split and LightGBM LambdaRank

In [26]:
# ===== Step 8 — Train/val split and LightGBM LambdaRank =====

import os, json
import lightgbm as lgb
import pandas as pd

# 1) Time-based split
train_df["ORDER_CREATED_DATE"] = pd.to_datetime(train_df["ORDER_CREATED_DATE"])
cutoff = train_df["ORDER_CREATED_DATE"].quantile(1 - VAL_FRACTION_TIME)
train_part = train_df[train_df["ORDER_CREATED_DATE"] < cutoff].copy()
val_part   = train_df[train_df["ORDER_CREATED_DATE"] >= cutoff].copy()

# 2) Feature list (exclude identifiers/labels)
NON_FEATURES = {
    "cart_id","label","ORDER_CREATED_DATE","candidate_item",
    "CUSTOMER_ID","STORE_NUMBER"   # <— add these two
}
FEATURES = [c for c in train_df.columns if c not in NON_FEATURES]

# Persist feature list for inference
with open(FILE_STAGE2_FEATS, "w") as f:
    json.dump(FEATURES, f)

# 3) Ensure all FEATURES are numeric (coerce objects safely) and downcast to float32 to save RAM
def ensure_numeric(df, cols):
    for c in cols:
        if c not in df.columns:
            df[c] = 0.0
        if not pd.api.types.is_numeric_dtype(df[c]):
            df[c] = pd.to_numeric(df[c], errors="coerce").fillna(0.0)
        df[c] = pd.to_numeric(df[c], downcast="float")
    return df

train_part_feat = ensure_numeric(train_part.copy(), FEATURES)
val_part_feat   = ensure_numeric(val_part.copy(), FEATURES)

# 4) Build LightGBM Datasets with group info (one group per cart)
def to_lgb_dataset(df):
    df = df.sort_values(["cart_id"]).copy()
    X = df[FEATURES]
    y = df["label"].astype(int)
    group_sizes = df.groupby("cart_id").size().tolist()
    dset = lgb.Dataset(X, label=y, free_raw_data=False)
    return dset, group_sizes, df

lgb_train, train_group, train_sorted = to_lgb_dataset(train_part_feat)
lgb_val,   val_group,   val_sorted   = to_lgb_dataset(val_part_feat)

# Set groups on datasets (required for ranking)
lgb_train.set_group(train_group)
lgb_val.set_group(val_group)

# Sanity checks
assert sum(train_group) == len(train_sorted), "Train groups don't sum to num rows."
assert sum(val_group)   == len(val_sorted),   "Val groups don't sum to num rows."

# 5) LightGBM params (LambdaRank) + speed/memory tweaks
params = dict(
    objective="lambdarank",
    metric="ndcg",
    ndcg_eval_at=[3],
    learning_rate=0.05,
    num_leaves=63,
    min_data_in_leaf=50,
    feature_fraction=0.9,
    bagging_fraction=0.9,
    bagging_freq=1,
    max_depth=-1,
    random_state=RANDOM_SEED,
    verbose=-1,
    # speed/memory
    num_threads=os.cpu_count(),
    max_bin=255,
    min_data_in_bin=3,
)

# 6) Callbacks for early stopping + logging (works across LightGBM versions)
callbacks = [
    lgb.early_stopping(stopping_rounds=100, first_metric_only=False),
    lgb.log_evaluation(period=50),
]

# 7) Train
model = lgb.train(
    params,
    lgb_train,
    num_boost_round=1000,
    valid_sets=[lgb_train, lgb_val],
    valid_names=["train", "val"],  # if your version errors on this, remove this line
    callbacks=callbacks,
)

# 8) Save model to disk
model.save_model(FILE_STAGE2_MODEL)
print("Saved model to", FILE_STAGE2_MODEL)

Training until validation scores don't improve for 100 rounds
[50]	train's ndcg@3: 0.671783	val's ndcg@3: 0.68207
[100]	train's ndcg@3: 0.686826	val's ndcg@3: 0.690709
[150]	train's ndcg@3: 0.696571	val's ndcg@3: 0.696733
[200]	train's ndcg@3: 0.70662	val's ndcg@3: 0.700724
[250]	train's ndcg@3: 0.712235	val's ndcg@3: 0.703222
[300]	train's ndcg@3: 0.716669	val's ndcg@3: 0.704836
[350]	train's ndcg@3: 0.721202	val's ndcg@3: 0.704892
[400]	train's ndcg@3: 0.726643	val's ndcg@3: 0.706856
[450]	train's ndcg@3: 0.731256	val's ndcg@3: 0.707527
[500]	train's ndcg@3: 0.736165	val's ndcg@3: 0.708223
[550]	train's ndcg@3: 0.739555	val's ndcg@3: 0.70796
[600]	train's ndcg@3: 0.745449	val's ndcg@3: 0.708546
[650]	train's ndcg@3: 0.751271	val's ndcg@3: 0.710248
[700]	train's ndcg@3: 0.756842	val's ndcg@3: 0.712066
[750]	train's ndcg@3: 0.761714	val's ndcg@3: 0.713437
[800]	train's ndcg@3: 0.765087	val's ndcg@3: 0.713459
[850]	train's ndcg@3: 0.76923	val's ndcg@3: 0.714611
[900]	train's ndcg@3: 0.7

### Step 9 — Validation Recall@K (1 & 3)

In [27]:
def recall_at_k(df_scored, k=3):
    hits = 0
    for cart_id, g in df_scored.groupby("cart_id"):
        topk = g.nlargest(k, "score")
        if (topk["label"] > 0).any():
            hits += 1
    return hits / df_scored["cart_id"].nunique()

# Score validation
val_scores = model.predict(val_sorted[FEATURES], num_iteration=model.best_iteration)
val_scored = val_sorted.copy()
val_scored["score"] = val_scores

r1 = recall_at_k(val_scored, k=1)
r3 = recall_at_k(val_scored, k=3)
print(f"Validation Recall@1={r1:.4f}, Recall@3={r3:.4f}")


Validation Recall@1=0.6397, Recall@3=0.7695


In [28]:
imp = pd.Series(model.feature_importance(importance_type="gain"), index=FEATURES).sort_values(ascending=False).head(20)
print(imp)

cand_is_favorite_item    2.114380e+06
P_j_given_i_norm         8.821843e+05
jaccard_norm             3.216634e+05
stage1_rank              2.118754e+05
total_order_price        2.095782e+05
popularity_norm          1.218129e+05
lift                     1.098521e+05
P_j_given_i              8.897433e+04
lift_norm                7.949733e+04
popularity               7.548164e+04
stage1_score             7.209091e+04
jaccard                  7.205901e+04
cooc_count               6.035629e+04
items_count              5.636283e+04
cart_has_wings           5.355803e+04
repeat_purchase_rate     4.055914e+04
cand_popularity          3.414247e+04
avg_order_value          2.552053e+04
cart_has_combo           2.515591e+04
cand_is_combo            2.448984e+04
dtype: float64


### Step 10 — Inference on test_data_question.csv → final Top-3

In [None]:
test_df = pd.read_csv(FILE_TEST)

# find item columns (item1, item2, ...)
ITEMCOLS_TEST = [c for c in test_df.columns if c.lower().startswith("item")]
ITEMCOLS_TEST = sorted(ITEMCOLS_TEST, key=lambda x: int(''.join(ch for ch in x if ch.isdigit()) or 0))

# optional: join a compact customer/context lookup if you want personalization at inference
# (build from modeling to avoid reloading the huge file)
ctx_small = ctx.copy()  # from Step 7
if "CUSTOMER_ID" in ctx_small.columns and "CUSTOMER_ID" in test_df.columns:
    # best-effort de-dup
    ctx_small = ctx_small.drop_duplicates(subset=["CUSTOMER_ID"]).copy()

def extract_test_cart(row):
    items = []
    for c in ITEMCOLS_TEST:
        val = row.get(c)
        if pd.notna(val) and str(val).strip():
            items.append(canonicalize_item(val))
    return items

# Prepare feature list
with open(FILE_STAGE2_FEATS, "r") as f:
    FEATURES = json.load(f)

rows_out = []
for idx, row in test_df.iterrows():
    cart_items = extract_test_cart(row)
    cand = get_candidates_for_cart(cart_items).copy()
    # basic features mirroring training
    cand = cand.merge(pop_df.rename(columns={POPULAR_COL_ITEM: COL_CAND,
                                             POPULAR_COL_FREQ: "cand_popularity"}),
                      on=COL_CAND, how="left")
    cand["stage1_rank"] = np.arange(1, len(cand)+1, dtype=int)
    cand["candidate_bucket"] = cand[COL_CAND].map(item_bucket)
    cand["cand_is_combo"]   = (cand["candidate_bucket"] == "combo").astype(int)
    cand["cand_is_fries"]   = (cand["candidate_bucket"] == "fries").astype(int)
    cand["cand_is_dip"]     = (cand["candidate_bucket"] == "dip").astype(int)
    cand["cand_is_wings"]   = cand["candidate_bucket"].isin(["wings","wings_spicy","wings_grilled"]).astype(int)
    cand["cand_is_strips"]  = (cand["candidate_bucket"] == "strips").astype(int)
    cand["cart_size"] = len(cart_items)
    cand["anchors_voted"] = cand[COL_CAND].map(lambda c: sum([c in set(P_by_anchor.get(a, pd.DataFrame())[COL_CAND]) for a in cart_items]))
    cand["cart_has_combo"] = int(any("combo" in it.lower() for it in cart_items))
    cand["cart_has_wings"] = int(any("wings" in it.lower() for it in cart_items))
    cand["cart_has_fries"] = int(any("fries" in it.lower() for it in cart_items))
    cand["cart_has_dip"]   = int(any("dip"   in it.lower() for it in cart_items))

    # attach context/personalization if available
    if "CUSTOMER_ID" in test_df.columns and "CUSTOMER_ID" in ctx_small.columns:
        row_ctx = ctx_small[ctx_small["CUSTOMER_ID"] == row["CUSTOMER_ID"]].head(1)
        if not row_ctx.empty:
            for c in ["orders_count","items_count","repeat_purchase_rate","avg_order_value",
                      "weekend_order_ratio","store_diversity_count",
                      "cust_registered","cust_guest","cust_special_membership"]:
                if c in row_ctx.columns:
                    cand[c] = float(row_ctx.iloc[0].get(c, 0.0))
            # favorite item match
            fav = str(row_ctx.iloc[0].get("favorite_item", "")).strip()
            cand["cand_is_favorite_item"] = (cand[COL_CAND] == fav).astype(int)
        else:
            cand["cand_is_favorite_item"] = 0
    else:
        cand["cand_is_favorite_item"] = 0

    # encode channel/occasion inputs
    for c in ["ORDER_CHANNEL_NAME","ORDER_SUBCHANNEL_NAME","ORDER_OCCASION_NAME"]:
        if c in test_df.columns:
            # create deterministic mapping based on training categories (if known)
            cand[c] = hash(str(row.get(c))) % 100  # simple numeric proxy

    # numeric cleanups
    for col in ["cand_popularity","jaccard","lift",COL_PROB,"stage1_score","stage1_rank",
                "orders_count","items_count","repeat_purchase_rate","avg_order_value",
                "weekend_order_ratio","store_diversity_count","cart_size","anchors_voted"]:
        if col in cand.columns:
            cand[col] = cand[col].fillna(0.0).astype(float)

    # Keep feature subset in correct order (missing columns auto-filled with 0)
    for fcol in FEATURES:
        if fcol not in cand.columns:
            cand[fcol] = 0.0

    import lightgbm as lgb
    booster = lgb.Booster(model_file=FILE_STAGE2_MODEL)
    scores = booster.predict(cand[FEATURES], num_iteration=booster.best_iteration if hasattr(booster,"best_iteration") else None)
    cand["score"] = scores

    # Pick top-3 (you can add your soft-diversity selector here if you want)
    top3 = cand.nlargest(3, "score")[COL_CAND].tolist()
    while len(top3) < 3: top3.append("")

    rows_out.append({
        "ORDER_ID": row["ORDER_ID"],
        "RECOMMENDATION_1": top3[0],
        "RECOMMENDATION_2": top3[1],
        "RECOMMENDATION_3": top3[2],
    })

stage2_out = pd.DataFrame(rows_out)
stage2_out.to_csv(FILE_STAGE2_SUB, index=False)
print("Saved:", FILE_STAGE2_SUB)
stage2_out.head()
