## Basket Recommendation System

In [14]:
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.metrics.pairwise import cosine_similarity
import warnings
warnings.filterwarnings("ignore")

In [15]:
# Display the installed pandas version so the environment behind the saved outputs is on record.
pd.__version__

'2.2.2'

## Read the datasets and setup the config

In [16]:
# === CONFIGURATION ===
# File names are bare, so they resolve against the notebook's working directory.
# Historical orders: source of the RF scores, global popularity and item similarity.
ALL_ORDERS_FILE = 'all_except_last_orders.csv'
# Partial baskets of the orders to complete (defines the test orders).
LAST_ORDERS_FILE = 'last_orders_subset.csv'
# Output CSV with columns ID, Member, Order, SKU.
SUBMISSION_FILE = 'submission.csv'
# Number of SKUs recommended per test order.
N_RECOMMENDATIONS = 5


## Optimal Hyperparameters

We tried several approaches and parameter settings before this one: Apriori association rules ranked by confidence, and an RF hybrid with varying lift-boost factors, decay rates, and lift ratios. The highest score among those was around 0.223.

***After using this RF + Item-Item Similarity we were able to see improvement***

In [17]:
FREQ_POWER = 2.0      # Exponent on purchase frequency in the RF score (2.0 = squared)
SIM_LAMBDA = 0.3      # Weight of the basket-similarity term added to the RF score
# NOTE(review): USE_EXP_DECAY is not read by any code visible in this notebook;
# recency_score always uses 1 / (days + 1) regardless of this flag.
USE_EXP_DECAY = False # Exponential recency decay disabled

## Data Loading & Pre-Processing

In [18]:
# === LOAD DATA ===
print("Loading datasets...")
df_past = pd.read_csv(ALL_ORDERS_FILE)
df_last_subset = pd.read_csv(LAST_ORDERS_FILE)

# === DATE PROCESSING ===
# Unparseable dates are coerced to NaT, and rows without a usable date are discarded.
df_past["Delivery Date"] = pd.to_datetime(df_past["Delivery Date"], errors="coerce")
df_past = df_past.dropna(subset=["Delivery Date"])

# Most recent historical delivery: the reference point for all recency scores.
last_delivery_date = df_past["Delivery Date"].max()

# One row per (Member, Order) pair that needs recommendations.
test_orders = df_last_subset[["Member", "Order"]].drop_duplicates(ignore_index=True)

print(f"Test orders: {test_orders.shape[0]}")
print(f"Anchor date: {last_delivery_date.date()}")


Loading datasets...
Test orders: 638
Anchor date: 2014-12-03


The **FREQ_POWER of 2.0** ensures that items bought consistently are rewarded far
more than one-off purchases, making them strong candidates for forgotten staples.

In [19]:
def recency_score(date, anchor_date=None):
    """Inverse-recency weight: ``1 / (days_since_purchase + 1)``.

    The score is 1.0 for a purchase on the anchor date and decays
    hyperbolically (not linearly) as the purchase gets older.

    Args:
        date: Date of the purchase being scored.
        anchor_date: Reference date that recency is measured against.
            Defaults to the notebook-level ``last_delivery_date``.

    Returns:
        float in (0, 1]; dates after the anchor are treated as 0 days old.
    """
    if anchor_date is None:
        anchor_date = last_delivery_date
    # Clamp future dates to "today" so the score never exceeds 1.0
    days_since = max((anchor_date - date).days, 0)
    return 1.0 / (days_since + 1.0)

# === PRECOMPUTE RF FOR ALL MEMBER-SKU PAIRS ===
# Frequency: number of historical order lines for each (Member, SKU) pair
member_sku_freq = (df_past.groupby(["Member", "SKU"])
                   .size()
                   .reset_index(name="Frequency"))

# LastPurchase: most recent delivery date for each (Member, SKU) pair
member_sku_last = (df_past.groupby(["Member", "SKU"])["Delivery Date"]
                   .max()
                   .reset_index()
                   .rename(columns={"Delivery Date": "LastPurchase"}))

member_sku = member_sku_freq.merge(member_sku_last, on=["Member", "SKU"])
member_sku["RecencyScore"] = member_sku["LastPurchase"].apply(recency_score)
# RF score = Frequency ** FREQ_POWER * RecencyScore
member_sku["RF_Score"] = (member_sku["Frequency"] ** FREQ_POWER
                          * member_sku["RecencyScore"])

# === LOOKUP DICTIONARY ===
# (Member, SKU) -> RF_Score. Built column-wise with zip rather than iterrows(),
# which would materialise a Series for every row.
rf_score_dict = dict(zip(zip(member_sku["Member"], member_sku["SKU"]),
                         member_sku["RF_Score"]))


## Global Popularity Fallback

* Order count = how many distinct baskets contain this SKU
* Measures breadth of appeal, not just volume
* Grocery insight: "staples" appear across many baskets

In [20]:
# === ORDER-BASED POPULARITY (not raw frequency) ===
# Popularity of a SKU = number of distinct orders that contain it.
sku_popularity_counts = df_past.groupby("SKU")["Order"].nunique()
sku_popularity_sorted = sku_popularity_counts.sort_values(ascending=False)
# SKUs ordered from most to least popular; used as the fallback ranking.
global_pop_list = sku_popularity_sorted.index.tolist()

top_global_sku = global_pop_list[0]
print(f" Top global SKU: {top_global_sku} "
      f"({sku_popularity_counts.loc[top_global_sku]} orders)")

 Top global SKU: 15668381 (530 orders)


## Item-Item Collaborative Filtering Matrix

*   Basket Matrix: Rows=Orders, Columns=SKUs → 10000*50000 sparse matrix
*  Binarize: quantity>0 → 1 (only co-occurrence matters)
*  Transpose: SKUs*Orders → now compute SKU↔SKU similarity
*  Cosine Similarity: For every SKU pair

In [21]:
print("ðŸ”— Building item-item similarity matrix...")

# === BASKET MATRIX: Orders x SKUs (binary) ===
# Count order lines per (Order, SKU), pivot SKUs into columns, 0 where absent.
order_sku_counts = df_past.groupby(["Order", "SKU"])["SKU"].count()
order_sku_matrix = order_sku_counts.unstack().fillna(0)
# Only presence matters for co-occurrence, so collapse counts to 0/1.
order_sku_bin = order_sku_matrix.gt(0).astype(int)

# === COSINE SIMILARITY: SKU x SKU ===
# Transpose so each row is a SKU described by the orders it appears in.
item_sim_matrix = cosine_similarity(order_sku_bin.T)
sku_list = order_sku_bin.columns.tolist()
# SKU -> row/column position in item_sim_matrix
sku_to_idx = dict(zip(sku_list, range(len(sku_list))))


ðŸ”— Building item-item similarity matrix...


## Similarity Scoring Function
Treats the current basket as a "query vector" → average neighborhood similarity.

In [22]:
def item_sim_score(target_sku, basket_skus):
    """Mean cosine similarity between ``target_sku`` and the SKUs in a basket.

    Basket SKUs that have no entry in ``sku_to_idx`` are ignored.

    Returns:
        float: average of the target's similarities to the known basket SKUs,
        or 0.0 when the target itself is unknown or no basket SKU is known.
    """
    target_idx = sku_to_idx.get(target_sku)
    if target_idx is None:
        return 0.0

    basket_idxs = [sku_to_idx[s] for s in basket_skus if s in sku_to_idx]
    if not basket_idxs:
        return 0.0

    # Pick the target's row, restricted to the basket's columns, and average it
    return float(item_sim_matrix[target_idx, basket_idxs].mean())

## Hybrid Recommendation Engine

Why λ=0.3? Grid search: 0.2=too weak, 0.4=overpowers RF signal.

Final Score: RF_score + 0.3 × Similarity → empirically optimal weights.

In [23]:
def get_hybrid_recommendations(member_id, order_id, df_last_subset,
                               rf_score_dict, global_pop_list, N=5):
    """Recommend N SKUs for one test order from RF score plus basket similarity.

    Candidates are the SKUs for which the member has an RF score, excluding
    those already in the order's partial basket. Each candidate is scored as
    ``RF_score + SIM_LAMBDA * item_sim_score(candidate, basket)``.

    Args:
        member_id: Member placing the order.
        order_id: Order whose partial basket is being completed.
        df_last_subset: Frame with at least "Order" and "SKU" columns.
        rf_score_dict: Mapping of (member, sku) -> RF score.
        global_pop_list: SKUs in fallback priority order.
        N: Number of recommendations to return.

    Returns:
        list: The top positive-scoring candidates, best first, then padding
        from ``global_pop_list`` (skipping basket and already-chosen SKUs),
        then synthetic placeholder IDs if the list is still short of N.
    """
    # SKUs already present in this order's partial basket
    in_this_order = df_last_subset["Order"] == order_id
    current_cart_skus = set(df_last_subset.loc[in_this_order, "SKU"].unique())

    # SKUs from the member's history that are not already in the basket
    member_skus = {sku for (member, sku) in rf_score_dict if member == member_id}
    candidates = [sku for sku in member_skus if sku not in current_cart_skus]

    # === HYBRID SCORING: RF + lambda * Similarity ===
    scores = {
        sku: (rf_score_dict.get((member_id, sku), 0.0)
              + SIM_LAMBDA * item_sim_score(sku, current_cart_skus))
        for sku in candidates
    }

    # === TOP-N + FALLBACK ===
    ranked = sorted(scores, key=scores.get, reverse=True)
    top_skus = [sku for sku in ranked if scores[sku] > 0][:N]

    # Not enough personal candidates: pad from the fallback list
    if len(top_skus) < N:
        for popular_sku in global_pop_list:
            if popular_sku in current_cart_skus or popular_sku in top_skus:
                continue
            top_skus.append(popular_sku)
            if len(top_skus) == N:
                break

    # Safety: exactly N recommendations, using synthetic IDs as a last resort
    while len(top_skus) < N:
        top_skus.append(900000000 + len(top_skus) + int(order_id))

    return top_skus[:N]

## Submission Generation

### Step 1: Initialize Storage
### Step 2: Loop Through Each Test Order
### Step 3 : Get Recommendations for This Order
1. Find all SKUs in this partial basket
   current_basket.
2. Get member's entire purchase history
   member_history.
3. Filter candidates (history - current_basket)
   candidates # Forgot to order these?
4. Score each candidate
5. Return top 5 SKUs
### Step 4: Create Row for Each SKU Recommendation
### Step 5: Repeat for All Test Orders
### Step 6: Convert to DataFrame & Save

In [24]:
print(" Generating final recommendations...")
submission_rows = []
row_id = 1  # running 1-based ID across all submission rows

for _, test_order in test_orders.iterrows():
    member_id = test_order["Member"]
    order_id = test_order["Order"]

    recs = get_hybrid_recommendations(member_id, order_id, df_last_subset,
                                      rf_score_dict, global_pop_list,
                                      N=N_RECOMMENDATIONS)

    # One submission row per recommended SKU, with consecutive IDs
    submission_rows.extend(
        {
            "ID": row_id + offset,
            "Member": member_id,
            "Order": int(order_id),
            "SKU": int(sku),
        }
        for offset, sku in enumerate(recs)
    )
    row_id += len(recs)

df_sub = pd.DataFrame(submission_rows)
submission_columns = ["ID", "Member", "Order", "SKU"]
df_sub[submission_columns].to_csv(SUBMISSION_FILE, index=False)

print(f" Saved '{SUBMISSION_FILE}'")
print(f" Rows: {df_sub.shape[0]}")
print(f" Orders: {df_sub['Order'].nunique()}")

 Generating final recommendations...
 Saved 'submission.csv'
 Rows: 3190
 Orders: 638
