In [1]:
import os, json, gc, math, random
import numpy as np
import pandas as pd

# === Files (same folder) ===
# All paths are bare file names, i.e. resolved relative to the working directory.
FILE_TEST     = "test_data_question.csv"
FILE_P        = "P_j_given_i.csv"
FILE_LIFT     = "item_lift_matrix.csv"
FILE_JACC     = "item_jaccard_matrix.csv"
FILE_COOC     = "item_cooccurrence_counts.csv"
FILE_POP      = "item_stats_counts_and_freq.csv"

# Stage-2 (LightGBM) artifacts.
FILE_STAGE2_MODEL = "stage2_lgbm_model.txt"
FILE_STAGE2_FEATS = "stage2_feature_columns.json"   # try to load; fall back to model feature names
FILE_CAT_VOCAB    = "stage2_cat_vocab.json"         # try to load; fall back to test-driven codes

FILE_MODELING     = "modeling_dataset.csv"          # optional: only to build a tiny ctx lookup

# === Stage-1 knobs (safe defaults) ===
# TOP_N_PER_ANCHOR: number of highest-P(j|i) candidate rows kept per anchor item.
TOP_N_PER_ANCHOR   = 60
# FALLBACK_MIN_CANDS: minimum candidate count per cart; shorter lists are padded
# with popular items, and the popularity-only fallback is capped at this size.
FALLBACK_MIN_CANDS = 40
# How per-anchor values are combined when several cart items propose the same candidate.
AGG_PROB_METHOD    = "one_minus_prod"   # {"one_minus_prod","sum","max"}
AGG_OTHER_METHOD   = "mean"             # {"mean","sum","max"}
# Weights of the min-max normalized probability / lift / jaccard / popularity
# signals in the stage-1 score (the defaults sum to 1.0).
W_PROB, W_LIFT, W_JACC, W_POPU = 0.60, 0.25, 0.10, 0.05

# === Popularity columns (from your file) ===
# Column of FILE_POP holding the item name.
# NOTE(review): "Unnamed: 0" is presumably the name pandas gives to a header-less
# first column (a saved index) - confirm against the actual file.
POPULAR_COL_ITEM = "Unnamed: 0"
POPULAR_COL_FREQ = "freq"

# === Column aliases ===
# Canonical column names used for every long-format (anchor, candidate, value) table.
COL_ANCHOR = "anchor_item"
COL_CAND   = "candidate_item"
COL_PROB   = "P_j_given_i"

def canonicalize_item(x):
    """Normalize an item name for use as a join key.

    Missing values (anything `pd.isna` reports as missing) are returned
    unchanged; every other value is converted to `str` and stripped of
    leading/trailing whitespace.
    """
    return x if pd.isna(x) else str(x).strip()

def safe_minmax(s: pd.Series):
    """Min-max scale a Series to [0, 1], tolerating bad input.

    Values are coerced to float; anything non-numeric or missing becomes 0.0
    before scaling. When the value range is (numerically) zero, an all-zero
    Series with the same index is returned instead of dividing by zero.
    """
    values = pd.to_numeric(s, errors="coerce").fillna(0.0).astype(float)
    floor = values.min()
    spread = values.max() - floor
    if spread < 1e-12:
        # Constant column: no ranking information, so everything scores 0.
        return pd.Series(np.zeros(len(values), dtype=float), index=values.index)
    return (values - floor) / spread

def one_minus_product_of_complements(values):
    """Combine probabilities as 1 - prod(1 - p).

    Each input is clipped to [0, 1] first. An empty input yields 0.0
    (the empty product is 1). Always returns a Python float.
    """
    probs = np.asarray(values, dtype=float)
    complements = 1.0 - np.clip(probs, 0.0, 1.0)
    return float(1.0 - complements.prod())

def item_bucket(name: str) -> str:
    """Map an item name to a coarse menu bucket by keyword.

    Matching is case-insensitive substring search. Rules are evaluated in a
    fixed priority order and the first hit wins (e.g. a name containing both
    "fries" and "combo" is bucketed as "fries"). Names matching no rule are
    "other".
    """
    text = str(name).lower()

    def has(*words):
        # True when at least one keyword occurs in the lowered name.
        return any(word in text for word in words)

    # (bucket, matched?) pairs in priority order - do not reorder.
    ordered_rules = (
        ("fries", has("fries")),
        ("combo", has("combo")),
        ("dip", has("dip")),
        ("sides", has("corn")),
        ("dessert", has("cake", "brownie", "cookie")),
        ("drink", has("drink", "soda")),
        ("wings_spicy", has("wings") and has("spicy")),
        ("wings_grilled", has("wings") and has("grilled")),
        ("wings", has("wings")),
        ("strips", has("strips")),
    )
    return next((bucket for bucket, matched in ordered_rules if matched), "other")


In [2]:
# 1) Popularity
# Keep only the item-name and frequency columns, drop incomplete rows,
# canonicalize the names and sort most-popular first (later code relies on
# this order when it takes `.head(...)` of pop_df as a fallback).
pop_df = pd.read_csv(FILE_POP)
pop_df = pop_df[[POPULAR_COL_ITEM, POPULAR_COL_FREQ]].dropna()
pop_df[POPULAR_COL_ITEM] = pop_df[POPULAR_COL_ITEM].map(canonicalize_item)
pop_df = pop_df.sort_values(POPULAR_COL_FREQ, ascending=False).reset_index(drop=True)

# 2) P(j|i) - wide or long
# The file is treated as a wide matrix (one row per anchor, one column per
# candidate) unless it already has a COL_PROB column and no 'item'/'Item' column.
P_raw = pd.read_csv(FILE_P)
if COL_PROB not in P_raw.columns or 'item' in P_raw.columns or 'Item' in P_raw.columns:
    # Wide: the row-label column is 'item', else 'Item', else the first column.
    row_item_col = 'item' if 'item' in P_raw.columns else ('Item' if 'Item' in P_raw.columns else P_raw.columns[0])
    P = P_raw.melt(id_vars=[row_item_col], var_name=COL_CAND, value_name=COL_PROB)\
             .rename(columns={row_item_col: COL_ANCHOR})
else:
    # Long: the first three columns are taken positionally as
    # anchor, candidate, probability.
    P = P_raw.rename(columns={P_raw.columns[0]: COL_ANCHOR,
                              P_raw.columns[1]: COL_CAND,
                              P_raw.columns[2]: COL_PROB})
# Canonicalize both item columns so they join against the other pair tables.
P[COL_ANCHOR] = P[COL_ANCHOR].map(canonicalize_item)
P[COL_CAND]   = P[COL_CAND].map(canonicalize_item)
P = P.dropna(subset=[COL_CAND, COL_PROB])
# 3) Melt lift/jaccard/cooc if needed
def to_long_pairs(df, value_col):
    """Return `df` as long-format (anchor, candidate, value) pairs.

    Two input layouts are accepted:

    * long  - `value_col` is already a column and at least two text columns
      exist; the first two text columns are renamed to COL_ANCHOR / COL_CAND.
    * wide  - anything else; the first column is taken as the anchor label and
      every other column is melted into (COL_CAND, `value_col`).

    In both layouts the anchor and candidate names are passed through
    `canonicalize_item`, so the result joins cleanly against other
    canonicalized pair tables. (Previously the long layout skipped this step,
    so names with stray whitespace silently failed to match.)

    Parameters
    ----------
    df : pandas.DataFrame
        Pair matrix or pair list as read from CSV. Not modified.
    value_col : str
        Name of the value column in the result.

    Returns
    -------
    pandas.DataFrame
        New frame containing at least COL_ANCHOR, COL_CAND and `value_col`.
    """
    cols = df.columns.tolist()
    # Text columns may be plain object dtype or one of pandas' dedicated
    # string dtypes, depending on pandas version/options.
    str_cols = [
        c for c in cols
        if pd.api.types.is_object_dtype(df[c]) or pd.api.types.is_string_dtype(df[c])
    ]

    if value_col in cols and len(str_cols) >= 2:
        long_df = df.rename(columns={str_cols[0]: COL_ANCHOR, str_cols[1]: COL_CAND})
    else:
        row_item_col = cols[0]
        long_df = (
            df.melt(id_vars=[row_item_col], var_name=COL_CAND, value_name=value_col)
              .rename(columns={row_item_col: COL_ANCHOR})
        )

    long_df[COL_ANCHOR] = long_df[COL_ANCHOR].map(canonicalize_item)
    long_df[COL_CAND] = long_df[COL_CAND].map(canonicalize_item)
    return long_df

# Load every pairwise signal in long form and keep only complete
# (anchor, candidate, value) rows.
lift_pairs = to_long_pairs(pd.read_csv(FILE_LIFT), "lift")[[COL_ANCHOR, COL_CAND, "lift"]].dropna()
jacc_pairs = to_long_pairs(pd.read_csv(FILE_JACC), "jaccard")[[COL_ANCHOR, COL_CAND, "jaccard"]].dropna()
cooc_pairs = to_long_pairs(pd.read_csv(FILE_COOC), "cooc_count")[[COL_ANCHOR, COL_CAND, "cooc_count"]].dropna()

# 4) Build per-anchor Top-N and prejoin signals ONCE
# Within each anchor, rows are ordered by descending P(j|i) so that head()
# keeps the TOP_N_PER_ANCHOR most likely candidates.
P_sorted = P.sort_values(by=[COL_ANCHOR, COL_PROB], ascending=[True, False]).copy()
P_topN = P_sorted.groupby(COL_ANCHOR, as_index=False).head(TOP_N_PER_ANCHOR)

# Attach lift, jaccard and co-occurrence count to each kept pair; pairs with
# no entry in a signal table get NaN for that signal.
pairs_all = P_topN
for _signal_pairs in (lift_pairs, jacc_pairs, cooc_pairs):
    pairs_all = pairs_all.merge(_signal_pairs, on=[COL_ANCHOR, COL_CAND], how="left")

# anchor item -> its candidate table (fresh 0..n-1 index)
CAND_BY_ANCHOR = {
    anchor_name: anchor_rows[[COL_CAND, COL_PROB, "lift", "jaccard", "cooc_count"]].reset_index(drop=True)
    for anchor_name, anchor_rows in pairs_all.groupby(COL_ANCHOR)
}

POPULAR_ITEMS = pop_df[POPULAR_COL_ITEM].tolist()
print("Stage-1 assets ready. Anchors:", len(CAND_BY_ANCHOR))


Stage-1 assets ready. Anchors: 130


In [4]:
def get_candidates_for_cart(cart_items):
    """Produce stage-1 candidate items, scored and ranked, for one cart.

    Every cart item that is a known anchor contributes its precomputed
    candidate table. Candidates already in the cart are removed, per-anchor
    signals are aggregated per candidate, min-max normalized within this cart
    and blended into `stage1_score` using the W_* weights.

    If no cart item is a known anchor, the most popular items not in the cart
    are returned instead (at most FALLBACK_MIN_CANDS rows, with only the
    candidate, `stage1_score` and `votes` columns). If fewer than
    FALLBACK_MIN_CANDS candidates are found, the list is padded with popular
    items scored 0.0.

    Parameters
    ----------
    cart_items : list of str
        Canonicalized item names currently in the cart.

    Returns
    -------
    pandas.DataFrame
        One row per candidate; always contains COL_CAND, "stage1_score" and
        "votes" (number of anchors proposing the candidate, 0 for
        popularity-filled rows).
    """
    # Collect the candidate table of each cart item that is a known anchor.
    anchor_tables = []
    for item in cart_items:
        table = CAND_BY_ANCHOR.get(item)
        if table is None or table.empty:
            continue
        tagged = table.copy()
        tagged["_anchor"] = item
        anchor_tables.append(tagged)

    if len(anchor_tables) == 0:
        # Nothing known about this cart: fall back to global popularity.
        outside_cart = ~pop_df[POPULAR_COL_ITEM].isin(cart_items)
        popular = pop_df[outside_cart].rename(columns={POPULAR_COL_ITEM: COL_CAND}).copy()
        popular["stage1_score"] = safe_minmax(popular[POPULAR_COL_FREQ])
        popular["votes"] = 0
        return popular[[COL_CAND, "stage1_score", "votes"]].head(FALLBACK_MIN_CANDS)

    pooled = pd.concat(anchor_tables, ignore_index=True)
    pooled = pooled[~pooled[COL_CAND].isin(cart_items)].copy()

    def combine_prob(series):
        # Combine P(j|i) across the anchors that proposed the same candidate.
        if AGG_PROB_METHOD == "one_minus_prod":
            return one_minus_product_of_complements(series)
        elif AGG_PROB_METHOD == "sum":
            return series.sum()
        else:
            return series.max()

    def combine_signal(series):
        # Combine lift / jaccard across anchors, ignoring missing values.
        if AGG_OTHER_METHOD == "mean":
            return series.mean(skipna=True)
        elif AGG_OTHER_METHOD == "sum":
            return series.sum(skipna=True)
        else:
            return series.max(skipna=True)

    aggregations = {
        COL_PROB: combine_prob,
        "lift": combine_signal,
        "jaccard": combine_signal,
        "cooc_count": "sum",
        "_anchor": "count",
    }
    grouped = (
        pooled.groupby(COL_CAND)
        .agg(aggregations)
        .rename(columns={"_anchor": "votes"})
        .reset_index()
    )

    popularity = pop_df.rename(columns={POPULAR_COL_ITEM: COL_CAND, POPULAR_COL_FREQ: "popularity"})
    grouped = grouped.merge(popularity, on=COL_CAND, how="left")

    # Normalize each signal within this cart's candidate set.
    for col in (COL_PROB, "lift", "jaccard", "popularity"):
        if col not in grouped.columns:
            grouped[col] = 0.0
        grouped[f"{col}_norm"] = safe_minmax(grouped[col])

    grouped["stage1_score"] = (
        W_PROB * grouped[f"{COL_PROB}_norm"]
        + W_LIFT * grouped["lift_norm"]
        + W_JACC * grouped["jaccard_norm"]
        + W_POPU * grouped["popularity_norm"]
    )

    shortfall = FALLBACK_MIN_CANDS - len(grouped)
    if shortfall > 0:
        # Pad with the most popular items that are neither candidates nor in the cart.
        taken = set(grouped[COL_CAND]).union(cart_items)
        filler = (
            pop_df[~pop_df[POPULAR_COL_ITEM].isin(taken)]
            .head(shortfall)
            .rename(columns={POPULAR_COL_ITEM: COL_CAND})
        )
        filler["stage1_score"] = 0.0
        filler["votes"] = 0
        kept_cols = [COL_CAND, "stage1_score", "votes", COL_PROB, "lift", "jaccard", "popularity"]
        grouped = pd.concat(
            [grouped[kept_cols], filler[[COL_CAND, "stage1_score", "votes"]]],
            ignore_index=True,
        )

    return grouped.sort_values("stage1_score", ascending=False).reset_index(drop=True)


In [5]:
import lightgbm as lgb

# Stage-2 ranker: LightGBM booster loaded from its text dump.
booster = lgb.Booster(model_file=FILE_STAGE2_MODEL)

# Feature list: the booster's own feature names are the fallback when no
# saved column list is present.
if not os.path.exists(FILE_STAGE2_FEATS):
    FEATURES = list(booster.feature_name())
else:
    with open(FILE_STAGE2_FEATS, "r") as f:
        FEATURES = json.load(f)
print("Features to use:", len(FEATURES))

# Category vocab (train->test mapping): for each categorical column, the
# position of a label in the saved list is its integer code. Left empty when
# the vocab file is absent.
cat2id = {}
if not os.path.exists(FILE_CAT_VOCAB):
    print("WARNING: category vocab not found. Will build codes from test values (may reduce accuracy).")
else:
    with open(FILE_CAT_VOCAB, "r") as f:
        cat_vocab = json.load(f)
    cat2id = {
        column: {label: code for code, label in enumerate(labels)}
        for column, labels in cat_vocab.items()
    }


Features to use: 74


In [6]:
# Optional per-customer context lookup (one row per CUSTOMER_ID) used for
# light personalization. Any failure leaves ctx_small as None and is
# reported, not raised.
ctx_small = None
try:
    base_cols = ["CUSTOMER_ID","orders_count","items_count","repeat_purchase_rate",
                 "avg_order_value","weekend_order_ratio","store_diversity_count",
                 "cust_registered","cust_guest","cust_special_membership","store_STATE"]
    # Peek at the header so only columns that actually exist are requested.
    preview = pd.read_csv(FILE_MODELING, nrows=5)
    available = set(preview.columns)
    usecols = [c for c in base_cols if c in available]
    # Stream the file in chunks, de-duplicating per chunk to bound memory,
    # then de-duplicate once more across chunks (first occurrence wins).
    cs = [
        chunk.drop_duplicates(subset=["CUSTOMER_ID"])
        for chunk in pd.read_csv(FILE_MODELING, usecols=usecols, chunksize=200_000)
    ]
    ctx_small = pd.concat(cs, ignore_index=True).drop_duplicates(subset=["CUSTOMER_ID"])
    print("Built ctx_small with rows:", len(ctx_small))
except Exception as e:
    print("Skipping ctx_small build:", e)


Built ctx_small with rows: 563346


In [7]:
# Load the test orders.
test_df = pd.read_csv(FILE_TEST)

# Cart columns (item1, item2, ...): every column whose name starts with
# "item" (case-insensitive), ordered by the digits embedded in the name
# (names without digits sort as 0).
ITEMCOLS_TEST = sorted(
    (col for col in test_df.columns if col.lower().startswith("item")),
    key=lambda col: int("".join(filter(str.isdigit, col)) or 0),
)

def extract_test_cart(row):
    """Return the canonicalized, non-empty cart items of one test row.

    Values are read from the ITEMCOLS_TEST columns in order; missing values
    and strings that are blank after stripping are skipped. Duplicates are
    kept.
    """
    raw_values = (row.get(col) for col in ITEMCOLS_TEST)
    return [
        canonicalize_item(value)
        for value in raw_values
        if pd.notna(value) and str(value).strip()
    ]

# ---------------------------------------------------------------------------
# Stage-2 scoring: for every test order, build candidate features, score them
# with the booster and keep the three best items not already in the cart.
# ---------------------------------------------------------------------------

# Per-customer context features copied onto every candidate row.
CTX_NUMERIC_COLS = ["orders_count","items_count","repeat_purchase_rate","avg_order_value",
                    "weekend_order_ratio","store_diversity_count",
                    "cust_registered","cust_guest","cust_special_membership"]
CTX_ALL_COLS = CTX_NUMERIC_COLS + ["store_STATE"]
ORDER_CAT_COLS = ["ORDER_CHANNEL_NAME","ORDER_SUBCHANNEL_NAME","ORDER_OCCASION_NAME"]

# Loop-invariant lookups, built once instead of once per order.
cand_popularity = pop_df.rename(columns={POPULAR_COL_ITEM: COL_CAND, POPULAR_COL_FREQ: "cand_popularity"})
has_ctx = (ctx_small is not None
           and "CUSTOMER_ID" in test_df.columns
           and "CUSTOMER_ID" in ctx_small.columns)

# Fallback category codes, used only when no training vocab was loaded.
# Built from the sorted distinct test values so the codes are identical on
# every run; the built-in hash() of a str is salted per process and would
# give different codes (and possible collisions) in each kernel session.
fallback_cat_codes = {}
if not cat2id:
    for c in ORDER_CAT_COLS:
        if c in test_df.columns:
            labels = sorted(test_df[c].map(str).unique())
            fallback_cat_codes[c] = {label: i for i, label in enumerate(labels)}

rows_out = []
for _, row in test_df.iterrows():
    cart_items = extract_test_cart(row)
    cand = get_candidates_for_cart(cart_items).copy()

    # Candidate attributes / basics
    cand = cand.merge(cand_popularity, on=COL_CAND, how="left")
    cand["stage1_rank"] = np.arange(1, len(cand)+1, dtype=int)
    cand["candidate_bucket"] = cand[COL_CAND].map(item_bucket)
    cand["cand_is_combo"]   = (cand["candidate_bucket"] == "combo").astype(int)
    cand["cand_is_fries"]   = (cand["candidate_bucket"] == "fries").astype(int)
    cand["cand_is_dip"]     = (cand["candidate_bucket"] == "dip").astype(int)
    cand["cand_is_wings"]   = cand["candidate_bucket"].isin(["wings","wings_spicy","wings_grilled"]).astype(int)
    cand["cand_is_strips"]  = (cand["candidate_bucket"] == "strips").astype(int)
    cand["cart_size"] = len(cart_items)
    cand["anchors_voted"] = cand.get("votes", 0)

    # Cart composition flags (substring match on the lowered item names).
    cand["cart_has_combo"] = int(any("combo" in it.lower() for it in cart_items))
    cand["cart_has_wings"] = int(any("wings" in it.lower() for it in cart_items))
    cand["cart_has_fries"] = int(any("fries" in it.lower() for it in cart_items))
    cand["cart_has_dip"]   = int(any("dip"   in it.lower() for it in cart_items))

    # Personalization (if lookup available); otherwise every context column is 0.0.
    rc = None
    if has_ctx:
        rc = ctx_small[ctx_small["CUSTOMER_ID"] == row.get("CUSTOMER_ID")].head(1)
    if rc is not None and not rc.empty:
        for c in CTX_NUMERIC_COLS:
            if c in rc.columns:
                cand[c] = float(rc.iloc[0].get(c, 0.0))
        if "store_STATE" in rc.columns and "store_STATE" in cat2id:
            s = str(rc.iloc[0].get("store_STATE",""))
            cand["store_STATE"] = cat2id["store_STATE"].get(s, -1)
    else:
        # Unknown customer, or no lookup at all.
        for c in CTX_ALL_COLS:
            cand[c] = 0.0

    # Encode channel/occasion with the saved vocab, or the deterministic
    # test-derived fallback. Unseen labels map to -1 in both cases.
    for c in ORDER_CAT_COLS:
        if c in test_df.columns:
            codes = cat2id.get(c, {}) if cat2id else fallback_cat_codes[c]
            cand[c] = int(codes.get(str(row.get(c, "")), -1))

    # Drop helper col and ensure FEATURES exist & numeric
    if "candidate_bucket" in cand.columns:
        cand = cand.drop(columns=["candidate_bucket"])

    for fcol in FEATURES:
        if fcol not in cand.columns:
            cand[fcol] = 0.0
        if not pd.api.types.is_numeric_dtype(cand[fcol]):
            cand[fcol] = pd.to_numeric(cand[fcol], errors="coerce").fillna(0.0)

    # Predict with best_iteration
    scores = booster.predict(cand[FEATURES], num_iteration=getattr(booster, "best_iteration", None))
    cand["score"] = scores

    # Top-3: unique items that are not already in the cart.
    in_cart = set(cart_items)
    out3 = []
    for it in cand.nlargest(3, "score")[COL_CAND].tolist():
        if it not in out3 and it not in in_cart:
            out3.append(it)
    if len(out3) < 3:
        # Something in the raw top 3 was filtered out: walk the rest of the
        # ranking so a slot is left blank only when candidates are exhausted.
        ranked = cand.sort_values("score", ascending=False, kind="mergesort")[COL_CAND].tolist()
        for it in ranked:
            if it not in out3 and it not in in_cart:
                out3.append(it)
            if len(out3) == 3:
                break
    while len(out3) < 3:
        out3.append("")

    rows_out.append({
        "ORDER_ID": row["ORDER_ID"],
        "RECOMMENDATION_1": out3[0],
        "RECOMMENDATION_2": out3[1],
        "RECOMMENDATION_3": out3[2],
    })

stage2_out = pd.DataFrame(rows_out)
OUT_FILE = "stage2_recommendations_top3.csv"
stage2_out.to_csv(OUT_FILE, index=False)
print("Saved:", OUT_FILE)
stage2_out.head()


Saved: stage2_recommendations_top3.csv


Unnamed: 0,ORDER_ID,RECOMMENDATION_1,RECOMMENDATION_2,RECOMMENDATION_3
0,9351345556,Chicken Sub,Add 5 Spicy Wings,Regular Buffalo Fries
1,3595377080,Ranch Dip - Regular,Blue Cheese Dip - Regular,2 pc Crispy Strips
2,4071757785,Regular Buffalo Fries,Add 5 Spicy Wings,Add 5 Grilled Wings
3,3931766769,Ranch Dip - Regular,Regular Buffalo Fries,Ranch Dip - Large
4,3739700809,Ranch Dip - Regular,Large Buffalo Fries,Fried Corn - Regular


In [9]:
import pandas as pd
# Re-align the saved recommendations to the row order of the test file
# (left join on ORDER_ID) and export them as an Excel workbook.
test = pd.read_csv("test_data_question.csv")
sub = test[["ORDER_ID"]].merge(
    pd.read_csv("stage2_recommendations_top3.csv"),
    on="ORDER_ID",
    how="left",
)
sub.to_excel("stage2_recommendations_top3.xlsx", index=False)

In [1]:
import pandas as pd

# === file names (same folder) ===
TEST_PATH = "test_data_question.csv"
RECS_PATH = "stage2_recommendations_top3.csv"
OUT_CSV   = "submission_test_data_question.csv"
OUT_XLSX  = "submission_test_data_question.xlsx"

# 1) Load (IDs as strings so the join key has the same dtype on both sides)
test = pd.read_csv(TEST_PATH, dtype={"ORDER_ID": str, "CUSTOMER_ID": str})
recs = pd.read_csv(RECS_PATH, dtype={"ORDER_ID": str})

# 2) Standardize recommendation column names (underscore -> space)
rename_map = {
    "RECOMMENDATION_1": "RECOMMENDATION 1",
    "RECOMMENDATION_2": "RECOMMENDATION 2",
    "RECOMMENDATION_3": "RECOMMENDATION 3",
}
# If they already have spaces, this is a no-op
recs = recs.rename(columns=rename_map)

# 3) Ensure we only keep the needed rec columns + ORDER_ID (one row per order)
rec_cols = ["RECOMMENDATION 1", "RECOMMENDATION 2", "RECOMMENDATION 3"]
keep_cols = ["ORDER_ID"] + [c for c in rec_cols if c in recs.columns]
recs = recs[keep_cols].drop_duplicates(subset=["ORDER_ID"], keep="first")

# 4) Left-join onto test (keeps test order)
merged = test.merge(recs, on="ORDER_ID", how="left")

# 5) Add any recommendation column the recs file did not provide.
#    This must happen before the reorder below, which selects every rec column.
for c in rec_cols:
    if c not in merged.columns:
        merged[c] = None

# 6) Count rows lacking at least one recommendation BEFORE blanks are filled;
#    after filling, isna() can never be true and the check would always say 0.
missing_rec_rows = int(merged[rec_cols].isna().any(axis=1).sum())

# 7) Put recommendation columns at the end (after all original test columns)
ordered_cols = [c for c in merged.columns if c not in rec_cols] + rec_cols
merged = merged[ordered_cols]

# 8) Fill any missing recs with blank strings (submission-safe)
for c in rec_cols:
    merged[c] = merged[c].fillna("")

# 9) Quick sanity checks (optional prints)
print("Rows in test:", len(test))
print("Rows in merged:", len(merged))
print("Missing rec rows:", missing_rec_rows)
print("Blank strings in recs:", int((merged[rec_cols] == "").sum().sum()))

# 10) Save outputs
merged.to_csv(OUT_CSV, index=False)
merged.to_excel(OUT_XLSX, index=False)

print("Saved:", OUT_CSV)
print("Saved:", OUT_XLSX)

Rows in test: 1000
Rows in merged: 1000
Missing rec rows: 0
Blank strings in recs: 0
Saved: submission_test_data_question.csv
Saved: submission_test_data_question.xlsx
