In [1]:
# Step 0: Imports
import pandas as pd
import numpy as np
from pathlib import Path

# Optional: gensim for item2vec (install gensim if using that section)
# pip install gensim

In [2]:
# Read the cleaned/encoded order table from disk.
orders_fp = "order_data_cleaned_and_encoded.csv"
df = pd.read_csv(orders_fp, low_memory=False)

# Columns that describe the order itself rather than an item.
# Extend this set if the file carries further metadata columns.
meta_cols = {
    "CUSTOMER_ID",
    "STORE_NUMBER",
    "ORDER_CREATED_DATE",
    "ORDER_ID",
    "ORDER_CHANNEL_NAME",
    "ORDER_SUBCHANNEL_NAME",
    "ORDER_OCCASION_NAME",
    "total_order_price",
}

# Every column that is not metadata is treated as an item column
# (original column order is kept).
item_cols = df.columns[~df.columns.isin(meta_cols)].tolist()
print(f"Detected {len(item_cols)} item columns")
# Peek at the first 30 names as a sanity check.
print(item_cols[:30])

Detected 130 item columns
['$19.99 Crispy Feast', '10 pc Grilled Wings', '10 pc Grilled Wings Combo', '10 pc Mixed Wings', '10 pc Mixed Wings Combo', '10 pc Spicy Wings', '10 pc Spicy Wings Combo', '100 pc Family Grilled Wings', '100 pc Family Mixed Wings', '100 pc Family Spicy Wings', '100 pc Grilled Wings', '100 pc Mixed Wings', '100 pc Spicy Wings', '15 pc Crispy Strips', '15 pc Grilled Wings', '15 pc Grilled Wings Combo', '15 pc Mixed Wings', '15 pc Mixed Wings Combo', '15 pc Spicy Wings', '15 pc Spicy Wings Combo', '2 pc Crispy Strips', '20 Oz Soda', '20 pc Crispy Strips', '20 pc Grilled Wings', '20 pc Mixed Wings', '20 pc Spicy Wings', '20pc Spicy Feast Deal', '24 pc Family Grilled Wings', '24 pc Family Mixed Wings', '24 pc Family Spicy Wings']


In [4]:
# Binarise the item columns: 1 where the value is positive, 0 otherwise.
# The result shares df's index and has one column per entry of item_cols.
presence = df[item_cols].gt(0).astype(np.int32)
n_orders = len(presence)  # one row per order
print("Orders:", n_orders)


Orders: 1414410


In [5]:
# Per-item support: in how many orders does each item appear ...
item_counts = presence.sum(axis=0).rename("count")
# ... and what share of all orders is that?
item_freq = item_counts.div(n_orders).rename("freq")

# Side-by-side table, most frequently ordered items first.
item_stats = pd.DataFrame({"count": item_counts, "freq": item_freq}).sort_values(
    "count", ascending=False
)
print(item_stats.head(20))
# Optional export of the full table:
item_stats.to_csv("item_stats_counts_and_freq.csv")


                            count      freq
Ranch Dip - Regular        302870  0.214132
20pc Spicy Feast Deal      267974  0.189460
10 pc Grilled Wings Combo  166664  0.117833
6 pc Grilled Wings Combo   117894  0.083352
8 pc Grilled Wings Combo   117595  0.083141
Regular Buffalo Fries      100141  0.070801
2 pc Crispy Strips          84162  0.059503
Ranch Dip - Large           80610  0.056992
6 pc Spicy Wings Combo      72234  0.051070
10 pc Grilled Wings         67043  0.047400
Large Buffalo Fries         59962  0.042394
8 pc Spicy Wings Combo      59308  0.041931
10 pc Spicy Wings           59039  0.041741
Fried Corn - Regular        58584  0.041419
Chicken Sub Combo           58169  0.041126
10 pc Spicy Wings Combo     57397  0.040580
Flavor Platter              55667  0.039357
3 pc Crispy Strips Combo    54478  0.038516
Chicken Sub                 45257  0.031997
15 pc Grilled Wings Combo   43452  0.030721


4) Co-occurrence matrix (dense)
Because there are fewer than 200 unique items (130 here), the full items × items matrix is small enough to compute densely with a single matrix multiplication.

In [6]:
# Pairwise co-occurrence counts: entry (i, j) is the number of orders that
# contain both item i and item j. Rows and columns are both labelled by item.
cooc_matrix = (presence.T @ presence).rename_axis(index="item", columns="item")

# Sanity check: an item co-occurs with itself in exactly the orders containing it,
# so the diagonal must reproduce item_counts.
diagonal = np.diag(cooc_matrix.values)
assert np.allclose(diagonal, item_counts.values), "Diagonal mismatch"

# Optional CSV export
cooc_matrix.to_csv("item_cooccurrence_counts.csv")


In [7]:
# Joint probability P(i & j): share of all orders that contain both items.
P_ij = cooc_matrix / n_orders

# Marginals P(i) and P(j): both are the per-item order frequency.
P_i = item_freq
P_j = item_freq

# Conditional probability P(j|i): divide row i by count(i).
# Items that appear in no order get a NaN count, so their whole row becomes
# NaN instead of triggering a division by zero.
counts = item_counts.replace(0, np.nan)
P_j_given_i = cooc_matrix.div(counts, axis=0)  # rows are i, columns are j

# Lift: P(i,j) / (P(i) * P(j)).
den = np.outer(P_i.values, P_j.values)  # den[i, j] = P(i) * P(j)
# Divide the labelled DataFrame (not its raw values) so lift keeps the same
# index/column labels and axis names as the other matrices. Zero denominators
# are masked to NaN, mirroring the guard used for P(j|i), which also avoids
# numpy's 0/0 RuntimeWarning for never-ordered items.
lift = P_ij / np.where(den > 0, den, np.nan)

# Jaccard: |i ∩ j| / |i ∪ j| = cooc / (count(i) + count(j) - cooc)
counts_matrix = np.add.outer(item_counts.values, item_counts.values)
union_counts = counts_matrix - cooc_matrix
# An empty union (both items never ordered) yields NaN rather than 0/0.
jaccard = cooc_matrix / union_counts.where(union_counts > 0)

# Save the matrices.
# Note: all three contain NaN in rows/columns of items that occur in no order.
P_j_given_i.to_csv("P_j_given_i.csv")
lift.to_csv("item_lift_matrix.csv")
jaccard.to_csv("item_jaccard_matrix.csv")


In [8]:
def get_topk(df_matrix, k=10):
    """
    Rank, for every item, its k highest-scoring partner items.

    df_matrix: square DataFrame whose index and columns are item names;
               cell (i, j) holds the score of j with respect to i.
    k:         number of partners to keep per item.

    Returns a dict mapping each item to a list of (item_j, score) tuples,
    ordered from highest to lowest score. The item itself is never listed
    among its own partners.
    """
    def _best_partners(item):
        # Take the item's row and remove the self-score, if present.
        row = df_matrix.loc[item].drop(labels=[item], errors='ignore')
        best = row.sort_values(ascending=False).iloc[:k]
        # tolist() converts numpy scalars to plain Python numbers.
        names = best.index.tolist()
        scores = best.values.tolist()
        return [(name, score) for name, score in zip(names, scores)]

    return {item: _best_partners(item) for item in df_matrix.index}

import json

# Rank the ten strongest partners of every item under three association scores.
# NaN scores are replaced by 0 before ranking where the matrix can contain them.
top10_by_Pjg = get_topk(P_j_given_i.fillna(0), k=10)   # conditional probability P(j|i)
top10_by_cooc = get_topk(cooc_matrix, k=10)            # raw co-occurrence count
top10_by_lift = get_topk(lift.fillna(0), k=10)         # lift

# Example: show the top five partners of the most frequently ordered item.
sample_item = item_stats.index[0]
for label, ranking in (
    ("Top by P(j|i) for", top10_by_Pjg),
    ("Top by cooc for", top10_by_cooc),
    ("Top by lift for", top10_by_lift),
):
    print(label, sample_item, ranking[sample_item][:5])

# Persist the conditional-probability rankings as JSON.
with open("top10_by_Pjg.json", "w") as f:
    f.write(json.dumps(top10_by_Pjg))


Top by P(j|i) for Ranch Dip - Regular [('10 pc Grilled Wings Combo', 0.15301944728761516), ('Regular Buffalo Fries', 0.13915541321358998), ('10 pc Grilled Wings', 0.12858982401690494), ('10 pc Spicy Wings', 0.10440122824974411), ('8 pc Grilled Wings Combo', 0.10009575065209496)]
Top by cooc for Ranch Dip - Regular [('10 pc Grilled Wings Combo', 46345), ('Regular Buffalo Fries', 42146), ('10 pc Grilled Wings', 38946), ('10 pc Spicy Wings', 31620), ('8 pc Grilled Wings Combo', 30316)]
Top by lift for Ranch Dip - Regular [('Sports Drink', 4.67002344240103), ('10 pc Mixed Wings', 3.0266194650927614), ('15 pc Mixed Wings', 2.917115631923525), ('20 pc Mixed Wings', 2.8278952059527023), ('10 pc Grilled Wings', 2.712866861383746)]


In [9]:
def score_candidates_from_cart(cart_items, P_j_given_i_df, item_counts_series, exclude_in_cart=True, weight_by_popularity=False):
    """
    Score every item as a candidate add-on for the given cart.

    The score of candidate j is the sum of P(j|i) over the distinct cart
    items i, optionally weighting each cart item's contribution by
    1 / log(1 + count(i)).

    cart_items: iterable of item names present in the cart. Names that are
        not in P_j_given_i_df.index are ignored; repeated names count once,
        because the underlying statistics are presence-based.
    P_j_given_i_df: DataFrame with rows = i (given item), columns = j
        (candidate), values = score of j given i.
    item_counts_series: Series indexed by item name holding the number of
        orders that contain each item.
    exclude_in_cart: if True, items already in the cart get a score of -inf
        so they sort last and are never recommended.
    weight_by_popularity: if True, down-weight the contribution of popular
        cart items. Cart items with a zero or missing count contribute
        nothing, instead of an infinite weight.

    Returns a float pd.Series indexed by item name, sorted by descending
    score. If no cart item is known, returns item_counts_series sorted by
    descending count as a global-popularity fallback.
    """
    # De-duplicate while keeping first-seen order; also materialises generators.
    cart = list(dict.fromkeys(cart_items))
    valid_cart = [c for c in cart if c in P_j_given_i_df.index]
    if not valid_cart:
        # fallback: return global popularity
        return item_counts_series.sort_values(ascending=False)

    # Only the rows belonging to cart items are needed (rows = i, columns = j).
    cart_rows = P_j_given_i_df.loc[valid_cart]

    if weight_by_popularity:
        cart_counts = item_counts_series.reindex(valid_cart).astype(float)
        # log1p(0) == 0 would give an infinite weight; mask non-positive and
        # missing counts so those cart items get weight 0 instead.
        weights = (1.0 / np.log1p(cart_counts.where(cart_counts > 0))).fillna(0.0)
        cart_rows = cart_rows.mul(weights, axis=0)

    # Force float so the -inf marker below is always representable, even when
    # an integer-valued matrix (e.g. raw counts) is passed in.
    summed = cart_rows.sum(axis=0).astype(float)  # Series indexed by j

    if exclude_in_cart:
        summed[summed.index.isin(cart)] = -np.inf  # ensure not recommended

    # sort descending
    return summed.sort_values(ascending=False)

# Example usage
# Demo: score add-on candidates for a two-item cart
# (replace these with item names that exist in your dataset).
cart = ["10 pc Grilled Wings Combo", "20 Oz Soda"]
cond_probs_filled = P_j_given_i.fillna(0)
scores = score_candidates_from_cart(cart, cond_probs_filled, item_counts)
# NOTE: despite its name, top3 keeps the ten highest-scoring candidates.
top3 = scores.iloc[:10]
print("Top candidates:", top3.iloc[:10])


Top candidates: item
Ranch Dip - Regular         0.546536
20pc Spicy Feast Deal       0.412700
Regular Buffalo Fries       0.180186
2 pc Crispy Strips          0.123584
10 pc Spicy Wings           0.119987
Ranch Dip - Large           0.106153
6 pc Grilled Wings Combo    0.105485
8 pc Grilled Wings Combo    0.105139
Fried Corn - Regular        0.100454
10 pc Spicy Wings Combo     0.078102
dtype: float64
