In [40]:
# === PATHS (edit if needed) ===
PATH_TEST = "test_data_question.csv"
PATH_P = "P_j_given_i.csv"
PATH_LIFT = "item_lift_matrix.csv"
PATH_JACCARD = "item_jaccard_matrix.csv"
PATH_COOC = "item_cooccurrence_counts.csv"
PATH_POP = "item_stats_counts_and_freq.csv"
# Optional: JSON file mapping each anchor item to a precomputed list of candidates.
PATH_TOP10_JSON = None  # keep None to skip the JSON; candidates then come from P only

# === CANDIDATE GENERATION HYPERPARAMETERS ===
TOP_N_PER_ANCHOR = 50      # how many candidates to pull per cart item (before merging)
FALLBACK_MIN_CANDS = 30    # ensure at least this many candidates by filling with popularity
FINAL_TOPK = 3             # number of recommendations written per order (Top-3 for submission)

# === RE-RANK (still Stage 1 aggregation) WEIGHTS for a simple score
# Applied to the min-max normalized signals; the four weights sum to 1.0.
W_PROB = 0.60
W_LIFT = 0.25
W_JACC = 0.10
W_POPU = 0.05

# How to aggregate multiple anchors (items in cart) for a candidate
AGG_PROB_METHOD = "one_minus_prod"  # choices: {"sum", "max", "one_minus_prod"}
AGG_OTHER_METHOD = "mean"           # choices: {"mean", "max", "sum"}

# Seed used when sampling a demo row from the test set
RANDOM_SEED = 42


In [41]:
import json
import numpy as np
import pandas as pd

pd.options.mode.chained_assignment = None

def canonicalize_item(x: str) -> str:
    """Normalize an item key so the same item matches across files.

    Missing values (anything `pd.isna` reports as missing) are returned
    unchanged; every other value is converted to `str` and stripped of
    leading/trailing whitespace.
    """
    return x if pd.isna(x) else str(x).strip()

def safe_minmax(s: pd.Series):
    """Rescale a series to [0, 1] by min-max normalization.

    Values are cast to float and missing entries are treated as 0.0 before
    scaling. When the series is (numerically) constant, a series of zeros
    with the same index is returned instead of dividing by ~0.
    """
    values = s.astype(float).fillna(0.0)
    lowest = values.min()
    spread = values.max() - lowest
    if spread < 1e-12:
        # Constant input: no ordering information, so every entry scores 0
        return pd.Series(np.zeros(len(values), dtype=float), index=values.index)
    return (values - lowest) / spread

def one_minus_product_of_complements(values):
    """Aggregate probabilities p_i -> 1 - Π(1 - p_i).

    Parameters
    ----------
    values : array-like of float
        Per-anchor probabilities for one candidate. Entries outside [0, 1]
        are clipped into that range.

    Returns
    -------
    float
        The combined probability. An empty (or all-missing) input yields 0.0.

    Notes
    -----
    NaN entries are ignored (treated as "no evidence") rather than propagated,
    matching the NaN-skipping behavior of the pandas `sum` / `max`
    aggregations this function is an alternative to. Without this, a single
    missing probability would turn the whole aggregate into NaN.
    """
    probs = np.asarray(values, dtype=float).ravel()
    probs = probs[~np.isnan(probs)]          # drop missing probabilities
    probs = np.clip(probs, 0.0, 1.0)
    return float(1.0 - np.prod(1.0 - probs))


In [42]:
# Load the test carts and the five precomputed item-statistics tables
test_df = pd.read_csv(PATH_TEST)
P, lift_df, jacc_df, cooc_df, pop_df = (
    pd.read_csv(path)
    for path in (PATH_P, PATH_LIFT, PATH_JACCARD, PATH_COOC, PATH_POP)
)

# Canonicalize every text-like (object dtype) column of the statistics tables
# in place; the exact key column names are configured in Step 3.
for df in (P, lift_df, jacc_df, cooc_df, pop_df):
    for col in df.columns:
        if df[col].dtype != "object":
            continue
        df[col] = df[col].map(canonicalize_item)


In [43]:
# ==== P_j_given_i.csv ====
# Target long-form layout: one row per (anchor_item, candidate_item) with the
# conditional probability. The file as loaded is a wide matrix (first column
# 'item', one column per candidate), so a later cell melts it into these names.
COL_ANCHOR = "anchor_item"        # e.g., "i" or "item_i" or "anchor"
COL_CAND   = "candidate_item"     # e.g., "j" or "item_j" or "candidate"
COL_PROB   = "P_j_given_i"        # e.g., "p" or "prob"

# If your file uses different names, uncomment and set them:
# COL_ANCHOR = "item_i"
# COL_CAND   = "item_j"
# COL_PROB   = "p_j_given_i"

# ==== item_lift_matrix.csv ====
# Can be either long-form with (anchor, candidate, lift) or a wide square matrix.
COL_LIFT = "lift"                 # value column name; a wide matrix is melted into it in Step 4.

# ==== item_jaccard_matrix.csv ====
COL_JACC = "jaccard"              # value column name; a wide matrix is melted into it in Step 4.

# ==== item_cooccurrence_counts.csv ====
COL_COOC = "cooc_count"           # value column name; a wide matrix is melted into it in Step 4.

# ==== item_stats_counts_and_freq.csv (popularity) ====
POPULAR_COL_ITEM = "Unnamed: 0"    # item column in popularity table (pandas' name for an unnamed first CSV column)
POPULAR_COL_FREQ = "freq"         # could be "probability" or "relative_freq" etc.

# ==== test_data_question.csv cart columns ====
# Not configured here: every column whose name starts with "item"
# (case-insensitive) is auto-detected later and ordered by the digits in its name.


In [44]:
# Show each table's header so the column mappings configured above can be checked
for label, frame in (
    ("P columns:", P),
    ("lift columns:", lift_df),
    ("jacc columns:", jacc_df),
    ("cooc columns:", cooc_df),
    ("pop columns:", pop_df),
    ("test columns:", test_df),
):
    print(label, frame.columns.tolist())


P columns: ['item', '$19.99 Crispy Feast', '10 pc Grilled Wings', '10 pc Grilled Wings Combo', '10 pc Mixed Wings', '10 pc Mixed Wings Combo', '10 pc Spicy Wings', '10 pc Spicy Wings Combo', '100 pc Family Grilled Wings', '100 pc Family Mixed Wings', '100 pc Family Spicy Wings', '100 pc Grilled Wings', '100 pc Mixed Wings', '100 pc Spicy Wings', '15 pc Crispy Strips', '15 pc Grilled Wings', '15 pc Grilled Wings Combo', '15 pc Mixed Wings', '15 pc Mixed Wings Combo', '15 pc Spicy Wings', '15 pc Spicy Wings Combo', '2 pc Crispy Strips', '20 Oz Soda', '20 pc Crispy Strips', '20 pc Grilled Wings', '20 pc Mixed Wings', '20 pc Spicy Wings', '20pc Spicy Feast Deal', '24 pc Family Grilled Wings', '24 pc Family Mixed Wings', '24 pc Family Spicy Wings', '25 pc Game Day Pack', '3 Strips Lunch', '3 pc Crispy Strips Combo', '3 pc Grilled Wings', '30 pc Crispy Strips', '30 pc Family Grilled Wings', '30 pc Family Mixed Wings', '30 pc Family Spicy Wings', '30 pc Grilled Wings', '30 pc Mixed Wings', '3

In [45]:
def to_long_pairs(df, value_col, item_id_col=None):
    """
    Return `df` as long-form rows keyed by (COL_ANCHOR, COL_CAND).

    Two input layouts are handled:
    - Long: detected when `df` has at least two object-dtype columns and a
      column named `value_col`. The first two object columns are renamed to
      COL_ANCHOR and COL_CAND; nothing else is changed.
    - Wide matrix: every other case. `item_id_col` (default: the first column)
      holds the row item names and each remaining column is one candidate.
      The matrix is melted into (COL_ANCHOR, COL_CAND, value_col) and both
      key columns are canonicalized.

    The input frame is never modified.
    """
    columns = df.columns.tolist()
    text_columns = [name for name in columns if df[name].dtype == "object"]

    # Long layout: only the key columns need their standard names
    if value_col in columns and len(text_columns) >= 2:
        return df.rename(columns={text_columns[0]: COL_ANCHOR,
                                  text_columns[1]: COL_CAND})

    # Wide layout: melt the matrix, using the id column as the anchor
    id_col = columns[0] if item_id_col is None else item_id_col
    matrix = df.copy()
    matrix[id_col] = matrix[id_col].map(canonicalize_item)
    melted = matrix.melt(id_vars=[id_col], var_name=COL_CAND, value_name=value_col)
    melted = melted.rename(columns={id_col: COL_ANCHOR})
    melted[COL_CAND] = melted[COL_CAND].map(canonicalize_item)
    return melted

# Bring the three pairwise tables into long (anchor, candidate, value) form
lift_pairs = to_long_pairs(lift_df, COL_LIFT)
jacc_pairs = to_long_pairs(jacc_df, COL_JACC)
cooc_pairs = to_long_pairs(cooc_df, COL_COOC)

# Canonicalize the key columns wherever they are present
for df in (P, lift_pairs, jacc_pairs, cooc_pairs):
    for c in (COL_ANCHOR, COL_CAND):
        if c not in df.columns:
            continue
        df[c] = df[c].map(canonicalize_item)


In [46]:
# Convert P to long form: (COL_ANCHOR, COL_CAND, COL_PROB).
# to_long_pairs() melts a wide matrix using its FIRST column as the row-item id
# (so the id column name is not hard-coded) and returns an already-long frame
# with its keys unchanged, which makes this cell safe to re-run after P has
# been overwritten below.
P_long = to_long_pairs(P, COL_PROB)

# Canonicalize names
P_long[COL_ANCHOR] = P_long[COL_ANCHOR].map(canonicalize_item)
P_long[COL_CAND]   = P_long[COL_CAND].map(canonicalize_item)

# Overwrite P
P = P_long


In [47]:
# P must already be long-form with the configured key/value columns
assert {COL_ANCHOR, COL_CAND, COL_PROB} <= set(P.columns), "Fix column mappings in Step 3."

# Per anchor, order candidates by probability (highest first) and keep the
# first TOP_N_PER_ANCHOR rows
P_sorted = P.sort_values(by=[COL_ANCHOR, COL_PROB], ascending=[True, False]).copy()
P_topN = P_sorted.groupby(COL_ANCHOR, as_index=False).head(TOP_N_PER_ANCHOR)

# Optional: precomputed top-k per anchor from JSON; any failure is reported
# and the JSON is skipped
topk_dict = {}
if PATH_TOP10_JSON and len(PATH_TOP10_JSON) > 0:
    try:
        with open(PATH_TOP10_JSON, "r") as f:
            topk_dict = json.load(f)
        # Canonicalize both the anchor keys and the candidate names
        topk_dict = {
            canonicalize_item(anchor_key): [canonicalize_item(cand) for cand in cands]
            for anchor_key, cands in topk_dict.items()
        }
    except Exception as e:
        print("Skipping JSON top-k; reason:", e)

# Lookup: anchor -> DataFrame[COL_CAND, COL_PROB] taken from P_topN
P_by_anchor = {
    anchor_key: rows[[COL_CAND, COL_PROB]].reset_index(drop=True)
    for anchor_key, rows in P_topN.groupby(COL_ANCHOR)
}

# Trim the pairwise signal tables to (anchor, candidate, value) and drop
# incomplete rows so they can be joined onto candidates later
lift_pairs = lift_pairs.loc[:, [COL_ANCHOR, COL_CAND, COL_LIFT]].dropna()
jacc_pairs = jacc_pairs.loc[:, [COL_ANCHOR, COL_CAND, COL_JACC]].dropna()
cooc_pairs = cooc_pairs.loc[:, [COL_ANCHOR, COL_CAND, COL_COOC]].dropna()

# Global popularity table: item + frequency, most frequent first
pop_df = (
    pop_df[[POPULAR_COL_ITEM, POPULAR_COL_FREQ]]
    .dropna()
    .assign(**{POPULAR_COL_ITEM: lambda t: t[POPULAR_COL_ITEM].map(canonicalize_item)})
    .sort_values(POPULAR_COL_FREQ, ascending=False)
    .reset_index(drop=True)
)

POPULAR_ITEMS = pop_df[POPULAR_COL_ITEM].tolist()


In [48]:
# Auto-detect the cart columns in test (names starting with "item", any case)
# and order them by the digits embedded in the name: item1, item2, ...
# A name without digits sorts as 0.
item_cols = sorted(
    (c for c in test_df.columns if c.lower().startswith("item")),
    key=lambda name: int("".join(filter(str.isdigit, name)) or 0),
)

def extract_cart_items(row):
    """Return the canonicalized, non-empty cart items of one test row.

    Reads the columns listed in `item_cols` in order; a missing column,
    a missing value, or a value that is blank after stripping is skipped.
    """
    raw_values = (row.get(col, np.nan) for col in item_cols)
    cleaned = [
        canonicalize_item(value)
        for value in raw_values
        if pd.notna(value) and str(value).strip()
    ]
    return [item for item in cleaned if item]  # non-empty


In [49]:
def get_candidates_for_cart(cart_items):
    """
    Rank add-on candidates for one cart.

    For every cart item (anchor) the candidate list comes from the optional
    JSON top-k (`topk_dict`) when the anchor is listed there, otherwise from
    `P_by_anchor`. Lift, Jaccard and co-occurrence values are joined per
    (anchor, candidate) pair, items already in the cart are removed, and the
    per-anchor rows are aggregated per candidate with AGG_PROB_METHOD (for the
    probability) and AGG_OTHER_METHOD (for the other signals). Probability,
    lift, Jaccard and global popularity are then min-max normalized and
    combined with W_PROB / W_LIFT / W_JACC / W_POPU. If fewer than
    FALLBACK_MIN_CANDS candidates result, the most popular remaining items
    are appended with score 0.0.

    Parameters
    ----------
    cart_items : list of str
        Canonicalized item names currently in the cart.

    Returns
    -------
    pd.DataFrame
        Columns [COL_CAND, "final_score"], best candidate first; a higher
        score is always better, including on the popularity-only path.

    Raises
    ------
    KeyError
        If AGG_PROB_METHOD or AGG_OTHER_METHOD is not a supported name.
    """
    # Popularity table with the same key name as the candidate frames
    popularity = pop_df.rename(columns={POPULAR_COL_ITEM: COL_CAND,
                                        POPULAR_COL_FREQ: "popularity"})

    # 1) Collect per-anchor candidates
    per_anchor_frames = []
    for anchor in cart_items:
        if topk_dict and anchor in topk_dict:
            # Prefer JSON top-k if available; pull the probability from P
            # where the pair exists, otherwise use 0.0
            anchor_probs = P.loc[P[COL_ANCHOR] == anchor, [COL_CAND, COL_PROB]]
            df = pd.DataFrame({COL_CAND: topk_dict[anchor]}).merge(
                anchor_probs, on=COL_CAND, how="left")
            df[COL_PROB] = df[COL_PROB].fillna(0.0)
        else:
            # Use P_by_anchor fallback
            df = P_by_anchor.get(anchor, pd.DataFrame(columns=[COL_CAND, COL_PROB])).copy()
        if df.empty:
            continue

        # Attach other signals for this (anchor, candidate) pair
        df[COL_ANCHOR] = anchor
        for pairs in (lift_pairs, jacc_pairs, cooc_pairs):
            df = df.merge(pairs, on=[COL_ANCHOR, COL_CAND], how="left")
        per_anchor_frames.append(df)

    if not per_anchor_frames:
        # No anchor had candidates → return popularity (excluding cart items).
        # ascending=True gives the most popular item the highest percentile,
        # so "higher final_score is better" holds here as in the main path.
        fallback = popularity[~popularity[COL_CAND].isin(cart_items)].copy()
        fallback["final_score"] = fallback["popularity"].rank(pct=True, ascending=True)
        return (fallback[[COL_CAND, "final_score"]]
                .head(FALLBACK_MIN_CANDS)
                .reset_index(drop=True))

    # 2) Combine all anchors' candidates and remove items already in cart
    cand_pairs = pd.concat(per_anchor_frames, ignore_index=True)
    cand_pairs = cand_pairs[~cand_pairs[COL_CAND].isin(cart_items)].copy()

    # 3) Aggregate across anchors for each candidate.
    #    Select the configured aggregator once instead of evaluating every
    #    alternative for every group.
    prob_agg = {
        "sum": lambda s: s.sum(),
        "max": lambda s: s.max(),
        "one_minus_prod": lambda s: one_minus_product_of_complements(s),
    }[AGG_PROB_METHOD]
    other_agg = {
        "mean": lambda s: s.mean(skipna=True),
        "max": lambda s: s.max(skipna=True),
        "sum": lambda s: s.sum(skipna=True),
    }[AGG_OTHER_METHOD]

    grouped = cand_pairs.groupby(COL_CAND).agg({
        COL_PROB: prob_agg,
        COL_LIFT: other_agg,
        COL_JACC: other_agg,
        COL_COOC: other_agg,
    }).reset_index()

    # 4) Attach popularity (global) for tie-breaking
    grouped = grouped.merge(popularity, on=COL_CAND, how="left")

    # 5) Normalize each signal and combine with weights
    for col in [COL_PROB, COL_LIFT, COL_JACC, "popularity"]:
        if col not in grouped.columns:
            grouped[col] = 0.0
        grouped[f"{col}_norm"] = safe_minmax(grouped[col])

    grouped["final_score"] = (
        W_PROB * grouped[f"{COL_PROB}_norm"] +
        W_LIFT * grouped[f"{COL_LIFT}_norm"] +
        W_JACC * grouped[f"{COL_JACC}_norm"] +
        W_POPU * grouped["popularity_norm"]
    )

    # 6) Popularity fallback to guarantee coverage
    need = max(0, FALLBACK_MIN_CANDS - len(grouped))
    if need > 0:
        taken = set(grouped[COL_CAND]).union(cart_items)
        filler = popularity.loc[~popularity[COL_CAND].isin(taken), [COL_CAND]].head(need).copy()
        filler["final_score"] = 0.0  # never ranks above a scored candidate
        grouped = pd.concat([grouped[[COL_CAND, "final_score"]], filler],
                            ignore_index=True)

    # 7) Return sorted candidate list. A stable sort keeps tied rows in their
    #    current order, so zero-score fillers stay after scored candidates
    #    with the same score and remain in popularity order among themselves.
    grouped = (grouped.sort_values("final_score", ascending=False, kind="mergesort")
                      .reset_index(drop=True))
    return grouped[[COL_CAND, "final_score"]]


In [50]:
# Sanity check on a single order: draw one test row (reproducible through
# RANDOM_SEED) and list the items in its cart
row = test_df.sample(1, random_state=RANDOM_SEED).iloc[0]
cart_items = extract_cart_items(row)
print("Cart items:", cart_items)

# Rank candidates for that cart and preview the first ten rows
cand_df = get_candidates_for_cart(cart_items)
cand_df.head(10)


Cart items: ['Chicken Sub Combo', '6 pc Grilled Wings Combo', 'Ranch Dip - Regular']


Unnamed: 0,candidate_item,final_score
0,10 pc Grilled Wings Combo,0.791638
1,8 pc Grilled Wings Combo,0.711733
2,2 pc Crispy Strips,0.66902
3,Regular Buffalo Fries,0.566795
4,10 pc Grilled Wings,0.554154
5,6 pc Spicy Wings Combo,0.523215
6,10 pc Spicy Wings,0.484875
7,Fried Corn - Regular,0.434329
8,20pc Spicy Feast Deal,0.426439
9,Chicken Sub,0.411842


In [53]:
def item_bucket(name: str) -> str:
    """Map an item name to a coarse category by case-insensitive substring match.

    Rules are checked in order and the first match wins, so for example a
    name containing both "wings" and "combo" is a "combo". Names matching no
    rule fall into "other".
    """
    lowered = name.lower()
    # (bucket, alternatives): an alternative is a tuple of substrings that
    # must ALL occur in the name; any matching alternative selects the bucket.
    rules = (
        ("fries", (("fries",),)),
        ("combo", (("combo",),)),
        ("dip", (("dip",),)),
        ("sides", (("corn",),)),
        ("dessert", (("cake",), ("brownie",), ("cookie",))),
        ("drink", (("drink",), ("soda",))),
        ("wings_spicy", (("wings", "spicy"),)),
        ("wings_grilled", (("wings", "grilled"),)),
        ("wings", (("wings",),)),
        ("strips", (("strips",),)),
    )
    for bucket, alternatives in rules:
        if any(all(part in lowered for part in alt) for alt in alternatives):
            return bucket
    return "other"

def pick_diverse_topk(ranked_df, k=3):
    """Pick up to `k` items from a ranked candidate frame, one per bucket first.

    Pass 1 walks `ranked_df` in row order and takes the first item of each
    `item_bucket` category until `k` items are chosen. If that yields fewer
    than `k`, pass 2 walks the frame again and backfills with the best items
    not chosen yet, regardless of bucket. Item names are read from the
    'candidate_item' column.
    """
    picks = []
    used_buckets = set()

    # Pass 1: at most one item per bucket
    for _, rec in ranked_df.iterrows():
        item = rec['candidate_item']
        bucket = item_bucket(item)
        if bucket not in used_buckets:
            picks.append(item)
            used_buckets.add(bucket)
            if len(picks) == k:
                break

    # Pass 2: backfill if we didn't hit k
    if len(picks) < k:
        for _, rec in ranked_df.iterrows():
            item = rec['candidate_item']
            if item in picks:
                continue
            picks.append(item)
            if len(picks) == k:
                break

    return picks[:k]


In [None]:
def recommend_topk_for_row(row, k=3):
    """Produce the `k` recommendations for one test row.

    Extracts the cart items, ranks candidates, picks a bucket-diverse top-k
    and pads with empty strings when fewer than `k` items are available.
    Returns a Series indexed RECOMMENDATION_1 .. RECOMMENDATION_k.
    """
    ranked = get_candidates_for_cart(extract_cart_items(row))
    picks = pick_diverse_topk(ranked, k=k)
    padded = picks + [""] * (k - len(picks))
    return pd.Series({f"RECOMMENDATION_{i+1}": padded[i] for i in range(k)})

# Score every test order; one RECOMMENDATION_i column per requested rank
reco_cols = [f"RECOMMENDATION_{i+1}" for i in range(FINAL_TOPK)]
reco_df = test_df.copy()
reco_df[reco_cols] = test_df.apply(lambda r: recommend_topk_for_row(r, k=FINAL_TOPK), axis=1)

# Save an output (intermediate) file; you'll paste these cols into the official template if needed.
# The file name follows FINAL_TOPK so it always matches the number of
# recommendation columns written (unchanged for FINAL_TOPK = 3).
OUT_PATH = f"stage1_candidate_recommendations_top{FINAL_TOPK}_v2.csv"
reco_df[["ORDER_ID"] + reco_cols].to_csv(OUT_PATH, index=False)
print(f"Saved: {OUT_PATH}")
reco_df.head(5)[["ORDER_ID"] + reco_cols]


Saved: stage1_candidate_recommendations_top{FINAL_TOPK}.csv


Unnamed: 0,ORDER_ID,RECOMMENDATION_1,RECOMMENDATION_2,RECOMMENDATION_3
0,9351345556,10 pc Grilled Wings Combo,2 pc Crispy Strips,Regular Buffalo Fries
1,3595377080,Ranch Dip - Regular,10 pc Grilled Wings,2 pc Crispy Strips
2,4071757785,Regular Buffalo Fries,10 pc Grilled Wings,Ranch Dip - Large
3,3931766769,Ranch Dip - Regular,Regular Buffalo Fries,Ranch Dip - Large
4,3739700809,Ranch Dip - Regular,Large Buffalo Fries,10 pc Spicy Wings
