# E-Commerce Market Basket Analysis (Instacart)

## Task 1 – Data Preparation


Imports + Settings

In [43]:
import os
from collections import Counter

import numpy as np
import pandas as pd


# Display settings: show up to 50 columns and use a 120-character display width.
for _option, _value in (("display.max_columns", 50), ("display.width", 120)):
    pd.set_option(_option, _value)


File Paths

In [44]:
# ---- FILE PATHS ----
# Set DATA_DIR to the folder that holds the Instacart CSV files. The default ""
# leaves every path as the bare file name, i.e. relative to the working directory.
DATA_DIR = ""

# Transaction tables
ORDERS_PATH   = os.path.join(DATA_DIR, "orders.csv")
PRIOR_PATH    = os.path.join(DATA_DIR, "order_products__prior.csv")

# Lookup tables
PRODUCTS_PATH = os.path.join(DATA_DIR, "products.csv")
AISLES_PATH   = os.path.join(DATA_DIR, "aisles.csv")
DEPTS_PATH    = os.path.join(DATA_DIR, "departments.csv")


load small lookup tables

In [45]:
# ---- Lookup tables: products, aisles, departments ----
# Only the listed product columns are read; id columns use compact integer dtypes.
product_columns = ["product_id", "product_name", "aisle_id", "department_id"]
product_dtypes = {"product_id": "int32", "aisle_id": "int16", "department_id": "int16"}

products = pd.read_csv(PRODUCTS_PATH, usecols=product_columns, dtype=product_dtypes)
aisles = pd.read_csv(AISLES_PATH, dtype={"aisle_id": "int16"})
departments = pd.read_csv(DEPTS_PATH, dtype={"department_id": "int16"})

print("Loaded products/aisles/departments")
print(*(table.shape for table in (products, aisles, departments)))


Loaded products/aisles/departments
(49688, 4) (134, 2) (21, 2)


load orders (only needed columns) + clean nulls

In [46]:
# ---- Orders table: read only the columns used by this notebook ----
# The dtype mapping doubles as the column selection (its keys are the usecols).
order_dtypes = {
    "order_id": "int32",
    "user_id": "int32",
    "eval_set": "category",
    "order_number": "int16",
    "order_dow": "int8",
    "order_hour_of_day": "int8",
    "days_since_prior_order": "float32",
}

# Rows missing any of the identifying fields are discarded right after loading.
orders = (
    pd.read_csv(ORDERS_PATH, usecols=list(order_dtypes), dtype=order_dtypes)
    .dropna(subset=["order_id", "user_id", "eval_set"])
)

print("orders:", orders.shape)
orders.head()


orders: (3421083, 7)


Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0


take a manageable subset (15,000 users)

In [47]:
# ---- SUBSET: sample N_USERS users to keep the dataset manageable ----
N_USERS = 15_000
rng = np.random.default_rng(42)  # fixed seed: the same users are drawn on every run

unique_users = orders["user_id"].unique()
# Sample without replacement; the size is capped at the number of distinct users.
sample_users = rng.choice(unique_users, size=min(N_USERS, len(unique_users)), replace=False)

# Keep only the sampled users' 'prior' orders: these are the baskets used for
# Market Basket Analysis. Both conditions are applied in a single pass.
is_sampled_user = orders["user_id"].isin(sample_users)
is_prior_order = orders["eval_set"] == "prior"
orders_sub = orders[is_sampled_user & is_prior_order].copy()

# Plain-Python set of order ids used to filter the order-product lines.
eligible_order_ids = set(orders_sub["order_id"].tolist())

print("orders_sub:", orders_sub.shape)
print("eligible_order_ids:", len(eligible_order_ids))


orders_sub: (230974, 7)
eligible_order_ids: 230974


chunk-read prior order products + clean lines

In [48]:
# ---- Chunk-read order_products_prior and keep only sampled orders ----
op_cols = ["order_id", "product_id"]
op_dtypes = {"order_id":"int32", "product_id":"int32"}

CHUNKSIZE = 2_000_000   # if RAM is low, use 500_000
kept_chunks = []

for chunk in pd.read_csv(PRIOR_PATH, usecols=op_cols, dtype=op_dtypes, chunksize=CHUNKSIZE):
    # remove nulls in critical fields
    chunk = chunk.dropna(subset=op_cols)

    # keep only sampled order_ids
    chunk = chunk[chunk["order_id"].isin(eligible_order_ids)]

    # remove duplicates within the chunk early so the kept pieces stay small
    chunk = chunk.drop_duplicates(subset=op_cols)

    kept_chunks.append(chunk)

# An order's lines can straddle a chunk boundary, so per-chunk de-duplication is
# not sufficient on its own: de-duplicate once more across the combined frame.
order_products_sub = (
    pd.concat(kept_chunks, ignore_index=True)
    .drop_duplicates(subset=op_cols)
    .reset_index(drop=True)
)

# Distinct items per order (used for removing 1-item baskets), counted from the
# final de-duplicated lines so the counts always agree with order_products_sub.
order_item_counts = Counter(order_products_sub["order_id"].tolist())

print("order_products_sub:", order_products_sub.shape)
order_products_sub.head()


order_products_sub: (2317518, 2)


Unnamed: 0,order_id,product_id
0,25,9755
1,25,31487
2,25,37510
3,25,14576
4,25,22105


remove orders with only one item

In [49]:
# ---- A basket needs at least two items: discard single-item orders ----
valid_orders = {oid for oid in order_item_counts if order_item_counts[oid] >= 2}

# Apply the same order filter to both the line-level and the order-level table.
line_is_valid = order_products_sub["order_id"].isin(valid_orders)
order_products_sub = order_products_sub.loc[line_is_valid].copy()
orders_sub = orders_sub.loc[orders_sub["order_id"].isin(valid_orders)].copy()

print("After removing 1-item orders:")
print("orders_sub:", orders_sub.shape)
print("order_products_sub:", order_products_sub.shape)


After removing 1-item orders:
orders_sub: (219497, 7)
order_products_sub: (2306041, 2)


remove very rare products

In [50]:
# ---- OPTIONAL: drop rarely purchased products to reduce the column count later ----
MIN_PRODUCT_COUNT = 50   # try 20, 50, 100 (higher = faster, fewer rules)

# Number of order lines per product in the current subset.
prod_counts = order_products_sub["product_id"].value_counts()
frequent_ids = prod_counts.loc[prod_counts.ge(MIN_PRODUCT_COUNT)].index.astype("int32")
keep_products = set(frequent_ids)

line_has_kept_product = order_products_sub["product_id"].isin(keep_products)
order_products_sub = order_products_sub.loc[line_has_kept_product].copy()

print("After removing rare products:")
print("order_products_sub:", order_products_sub.shape)
print("Unique products kept:", order_products_sub["product_id"].nunique())


After removing rare products:
order_products_sub: (1991413, 2)
Unique products kept: 6629


attach product names (small merge)

In [51]:
# ---- Attach product names (done after filtering so the merge stays small) ----
product_names = products[["product_id", "product_name"]]

# Left-join on product_id, then discard lines that found no name.
order_products_sub = (
    order_products_sub
    .merge(product_names, how="left", on="product_id")
    .dropna(subset=["product_name"])
)

print("Final cleaned order_products_sub:", order_products_sub.shape)
order_products_sub.head()


Final cleaned order_products_sub: (1991413, 3)


Unnamed: 0,order_id,product_id,product_name
0,25,9755,Original Popcorn
1,25,31487,Boomchickapop Sweet & Salty Kettle Corn
2,25,14576,Delights Turkey Sausage Egg Whites & Cheese En...
3,25,22105,Ultra Thin Sliced Provolone Cheese
4,25,6383,All Whites 100% Egg Whites


Self Check

In [52]:
# Size summary of the cleaned tables at this point in the pipeline.
n_transactions = orders_sub["order_id"].nunique()
n_products = order_products_sub["product_id"].nunique()
n_lines = len(order_products_sub)

print("Transactions (orders):", n_transactions)
print("Unique products:", n_products)
print("Lines (order-product rows):", n_lines)


Transactions (orders): 219497
Unique products: 6629
Lines (order-product rows): 1991413


re-drop 1-item baskets AFTER rare-product filtering

In [53]:
# ---- FIX: drop orders left with fewer than 2 products after the rare-product filter ----
order_sizes = order_products_sub.groupby("order_id")["product_id"].nunique()
valid_orders = order_sizes.loc[order_sizes.ge(2)].index

# Keep the line-level and order-level tables restricted to the same orders.
line_is_valid = order_products_sub["order_id"].isin(valid_orders)
order_products_sub = order_products_sub.loc[line_is_valid].copy()
orders_sub = orders_sub.loc[orders_sub["order_id"].isin(valid_orders)].copy()

print("After re-removing 1-item orders:")
print("orders_sub unique orders:", orders_sub["order_id"].nunique())
print("order_products_sub rows:", len(order_products_sub))


After re-removing 1-item orders:
orders_sub unique orders: 213416
order_products_sub rows: 1986391


## Task 2 – Basket Construction


Build transaction list (order → list of product_ids)

In [54]:
# ---- TASK 2.1: Build baskets: order_id -> list of product_ids ----

# Cast both id columns to int32 before grouping.
for id_column in ("order_id", "product_id"):
    order_products_sub[id_column] = order_products_sub[id_column].astype("int32")

# One list of product ids per order; the result is indexed by order_id.
products_by_order = order_products_sub.groupby("order_id")["product_id"]
baskets = products_by_order.apply(list)

first_basket = baskets.iloc[0]
print("Number of baskets:", len(baskets))
print("Example basket (first one):", first_basket[:20])  # show up to 20 items


Number of baskets: 213416
Example basket (first one): [9755, 31487, 14576, 22105, 6383, 39046, 48171, 30320, 15925, 37449, 48299]


Convert baskets into a clean “transactions” table

In [55]:
# ---- TASK 2.2: Transactions table: one row per order with its item list ----

# Name the values "items" and the index "order_id", then turn the index into a column.
transactions_df = baskets.rename("items").rename_axis("order_id").reset_index()

# Basket size is the length of each item list (handy for analysis/debugging).
transactions_df["basket_size"] = transactions_df["items"].map(len).astype("int16")

print("transactions_df:", transactions_df.shape)
transactions_df.head()


transactions_df: (213416, 3)


Unnamed: 0,order_id,items,basket_size
0,25,"[9755, 31487, 14576, 22105, 6383, 39046, 48171...",11
1,70,"[11067, 11481, 44008, 35824]",4
2,72,"[28204, 39108, 4472, 29228, 20588, 24852, 4747...",8
3,106,"[4210, 19836, 27288, 23011, 37449, 47144, 2262...",19
4,144,"[45437, 35921, 43394, 12384, 1503, 24799, 2247...",14


Quick quality checks

In [56]:
# ---- TASK 2.3: Quality checks on basket sizes ----

basket_sizes = transactions_df["basket_size"]
min_size, max_size, mean_size = basket_sizes.min(), basket_sizes.max(), basket_sizes.mean()

print("Basket size min:", min_size)
print("Basket size max:", max_size)
print("Basket size mean:", round(mean_size, 2))

# A non-zero count here would mean single-item orders slipped through the filters.
print("Number of 1-item baskets:", basket_sizes.eq(1).sum())


Basket size min: 2
Basket size max: 90
Basket size mean: 9.31
Number of 1-item baskets: 0


## Task 3 – One-Hot Encoding


Build One-Hot matrix from baskets

In [57]:
# ---- TASK 3.1: One-Hot Encoding as a sparse boolean (orders x products) matrix ----
from scipy.sparse import coo_matrix

# Row axis: one row per basket, in the order of the baskets index.
order_ids = baskets.index.to_numpy()
order_id_to_row = pd.Series(np.arange(len(order_ids), dtype=np.int32), index=order_ids)

# Column axis: one column per product, in ascending product_id order.
unique_products = np.sort(order_products_sub["product_id"].unique())
product_id_to_col = pd.Series(np.arange(len(unique_products), dtype=np.int32), index=unique_products)

# Translate every order-product line into a (row, col) coordinate.
# IMPORTANT: order_products_sub must only contain orders that are present in baskets.
rows = order_products_sub["order_id"].map(order_id_to_row).to_numpy(dtype=np.int32)
cols = order_products_sub["product_id"].map(product_id_to_col).to_numpy(dtype=np.int32)

# One True per line, assembled in COO form and converted to CSR.
data = np.ones(len(order_products_sub), dtype=np.bool_)
matrix_shape = (len(order_ids), len(unique_products))
X = coo_matrix((data, (rows, cols)), shape=matrix_shape).tocsr()

# Sparse-backed DataFrame view of the same matrix, labelled by order id / product id.
basket_onehot = pd.DataFrame.sparse.from_spmatrix(X, index=order_ids, columns=unique_products)

print("One-hot shape (orders x products):", basket_onehot.shape)
print("Non-zeros (total 1s):", X.nnz)
basket_onehot.iloc[:5, :5]


One-hot shape (orders x products): (213416, 6629)
Non-zeros (total 1s): 1986391


  basket_onehot = pd.DataFrame.sparse.from_spmatrix(X, index=order_ids, columns=unique_products)


Unnamed: 0,1,10,23,25,28
25,0,0,0,0,0
70,0,0,0,0,0
72,0,0,0,0,0
106,0,0,0,0,0
144,0,0,0,0,0


Quick sanity checks

In [58]:
# ---- TASK 3.2: Sanity checks on the one-hot matrix ----

print("X shape:", X.shape)
print("X nnz:", X.nnz)

# Items per order, summed straight from the CSR matrix and flattened to 1-D.
row_sums_fast = np.asarray(X.sum(axis=1)).ravel()

print("Row sum min:", int(row_sums_fast.min()))
print("Row sum max:", int(row_sums_fast.max()))
print("Row sum mean:", float(row_sums_fast.mean()))

# Reference basket sizes computed independently from order_products_sub,
# re-ordered to match the row order of X (order_ids); missing orders count as 0.
true_sizes = order_products_sub.groupby("order_id")["product_id"].nunique()
true_sizes_aligned = true_sizes.reindex(order_ids).fillna(0).to_numpy()

sizes_differ = row_sums_fast != true_sizes_aligned
mismatch = np.sum(sizes_differ)
print("Mismatch count:", int(mismatch))


X shape: (213416, 6629)
X nnz: 1986391
Row sum min: 2
Row sum max: 90
Row sum mean: 9.307601117067136
Mismatch count: 0


## Task 4 – Apriori Algorithm

Build a smaller Apriori matrix (Top-N products)

In [59]:
from mlxtend.frequent_patterns import apriori

# ---- TASK 4.0: Restrict Apriori to the TOP_N most frequent products ----
TOP_N = 500  # try 300, 500, 1000 depending on speed/RAM

# Product ids ordered by how many order lines they appear in, most frequent first.
product_frequency = order_products_sub["product_id"].value_counts()
top_products = product_frequency.head(TOP_N).index.to_numpy()

# Column positions of those products in X, then the matching column slice.
top_col_idx = product_id_to_col.loc[top_products].to_numpy()
X_top = X[:, top_col_idx]

# Dense boolean frame; the column count is limited to TOP_N.
dense_top = X_top.toarray().astype(bool)
basket_onehot_top = pd.DataFrame(dense_top, index=order_ids, columns=top_products)

n_rows, n_cols = X_top.shape
print("Apriori matrix shape:", basket_onehot_top.shape)
print("Density:", round(X_top.nnz / (n_rows*n_cols), 6))


Apriori matrix shape: (213416, 500)
Density: 0.009294


Run Apriori with min_support = 0.05

In [60]:
# ---- TASK 4.1: Apriori (min_support=0.05) ----
# Mine itemsets, record each itemset's length, and order by support then size
# (both descending) with a fresh 0..n-1 index.
freq_05 = (
    apriori(basket_onehot_top, min_support=0.05, use_colnames=True, low_memory=True)
    .assign(itemset_size=lambda found: found["itemsets"].map(len).astype("int16"))
    .sort_values(["support", "itemset_size"], ascending=False, ignore_index=True)
)

# Report 0 as the maximum size when no itemset reached the threshold.
largest_size_05 = int(freq_05["itemset_size"].max()) if len(freq_05) else 0

print("Frequent itemsets (0.05):", len(freq_05))
print("Max itemset size (0.05):", largest_size_05)
freq_05.head(10)


Frequent itemsets (0.05): 7
Max itemset size (0.05): 1


Unnamed: 0,support,itemsets,itemset_size
0,0.159182,(24852),1
1,0.124695,(13176),1
2,0.090195,(21137),1
3,0.080463,(21903),1
4,0.072563,(47209),1
5,0.058983,(47766),1
6,0.052269,(47626),1


Run Apriori with min_support = 0.01

In [61]:
# ---- TASK 4.2: Apriori (min_support=0.01) ----
# Mine itemsets, record each itemset's length, and order by support then size
# (both descending) with a fresh 0..n-1 index.
freq_01 = (
    apriori(basket_onehot_top, min_support=0.01, use_colnames=True, low_memory=True)
    .assign(itemset_size=lambda found: found["itemsets"].map(len).astype("int16"))
    .sort_values(["support", "itemset_size"], ascending=False, ignore_index=True)
)

# Report 0 as the maximum size when no itemset reached the threshold.
largest_size_01 = int(freq_01["itemset_size"].max()) if len(freq_01) else 0

print("Frequent itemsets (0.01):", len(freq_01))
print("Max itemset size (0.01):", largest_size_01)
freq_01.head(10)


Frequent itemsets (0.01): 129
Max itemset size (0.01): 2


Unnamed: 0,support,itemsets,itemset_size
0,0.159182,(24852),1
1,0.124695,(13176),1
2,0.090195,(21137),1
3,0.080463,(21903),1
4,0.072563,(47209),1
5,0.058983,(47766),1
6,0.052269,(47626),1
7,0.048263,(16797),1
8,0.046665,(26209),1
9,0.045282,(27845),1


Comparison table

In [62]:
# One summary row per support threshold; size statistics fall back to 0 when
# a threshold produced no itemsets.
summary_rows = []
for threshold, itemsets in ((0.05, freq_05), (0.01, freq_01)):
    found_any = len(itemsets) > 0
    sizes = itemsets["itemset_size"]
    summary_rows.append({
        "min_support": threshold,
        "num_itemsets": len(itemsets),
        "max_itemset_size": int(sizes.max()) if found_any else 0,
        "avg_itemset_size": float(sizes.mean()) if found_any else 0,
    })

comparison = pd.DataFrame(
    summary_rows,
    columns=["min_support", "num_itemsets", "max_itemset_size", "avg_itemset_size"],
)
comparison


Unnamed: 0,min_support,num_itemsets,max_itemset_size,avg_itemset_size
0,0.05,7,1,1.0
1,0.01,129,2,1.147287


## Conclusion

In this project, Market Basket Analysis and Association Rule Mining were performed on the Instacart dataset using an efficient, scalable pipeline. Due to the large size of the raw data, careful preprocessing and memory-aware techniques were applied, including user sampling, removal of rare products, and sparse one-hot encoding.

Transactions were constructed at the order level, ensuring that each basket contained at least two items. A sparse boolean one-hot matrix was then generated, preserving correctness while remaining computationally feasible for large-scale analysis.

The Apriori algorithm was applied using two different minimum support thresholds. With a higher threshold (min_support = 0.05), only a small number of frequent itemsets were discovered, all of which consisted of single highly popular products. This reflects strong but very general purchasing behavior. When the support threshold was lowered to 0.01, the number of frequent itemsets increased substantially, and product pairs began to emerge, revealing more detailed and potentially actionable co-purchase patterns.

These results are consistent with how the Apriori algorithm works: raising the minimum support can only shrink the set of frequent itemsets, and because every subset of a frequent itemset must itself be frequent (the downward closure property), larger itemsets are the first to disappear as the threshold rises. From a business perspective, higher support thresholds are useful for identifying widely popular products, while lower thresholds enable the discovery of meaningful cross-selling opportunities.

Overall, this analysis demonstrates how association rule mining can be effectively applied to large-scale e-commerce data to extract insights that support recommendation systems, promotions, and product placement strategies.


## Task 5 – Association Rules


Generate association rules

In [63]:
from mlxtend.frequent_patterns import association_rules

# ---- TASK 5.1: Generate association rules ----
# Rules are derived from the min_support=0.01 itemsets and kept when their
# lift meets the 1.0 threshold.
rule_filter = {"metric": "lift", "min_threshold": 1.0}
rules = association_rules(freq_01, **rule_filter)

print("Total rules generated:", len(rules))
rules.head()


Total rules generated: 36


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
0,(13176),(21137),0.124695,0.090195,0.02049,0.164324,1.821884,1.0,0.009244,1.088706,0.515384,0.105404,0.081479,0.195752
1,(21137),(13176),0.090195,0.124695,0.02049,0.227181,1.821884,1.0,0.009244,1.132612,0.49584,0.105404,0.117085,0.195752
2,(13176),(47209),0.124695,0.072563,0.019764,0.1585,2.184323,1.0,0.010716,1.102124,0.619433,0.111352,0.092661,0.215437
3,(47209),(13176),0.072563,0.124695,0.019764,0.272375,2.184323,1.0,0.010716,1.202961,0.584613,0.111352,0.168718,0.215437
4,(21137),(24852),0.090195,0.159182,0.019338,0.214401,1.34689,1.0,0.00498,1.070289,0.283081,0.084063,0.065673,0.167942


Select relevant metrics only (clean table)

In [64]:
# ---- TASK 5.2: Clean rules table ----
# Keep only the columns used in the report and order the rules by lift,
# highest first, with a fresh 0..n-1 index.
report_columns = ["antecedents", "consequents", "support", "confidence", "lift"]

rules_clean = (
    rules.loc[:, report_columns]
    .sort_values("lift", ascending=False, ignore_index=True)
)

rules_clean.head(10)


Unnamed: 0,antecedents,consequents,support,confidence,lift
0,(21137),(27966),0.010557,0.117045,2.768401
1,(27966),(21137),0.010557,0.249695,2.768401
2,(27966),(13176),0.011831,0.27984,2.244191
3,(13176),(27966),0.011831,0.094882,2.244191
4,(24852),(28204),0.010435,0.065554,2.22103
5,(28204),(24852),0.010435,0.353548,2.22103
6,(47209),(13176),0.019764,0.272375,2.184323
7,(13176),(47209),0.019764,0.1585,2.184323
8,(47766),(21903),0.01036,0.175643,2.182921
9,(21903),(47766),0.01036,0.128756,2.182921


Extract Top 3 rules by Lift

In [65]:
# ---- TASK 5.3: Top 3 rules by Lift ----
# rules_clean is already sorted by lift (descending), so the first three rows are the top 3.
top_3_rules = rules_clean.iloc[:3]
top_3_rules

Unnamed: 0,antecedents,consequents,support,confidence,lift
0,(21137),(27966),0.010557,0.117045,2.768401
1,(27966),(21137),0.010557,0.249695,2.768401
2,(27966),(13176),0.011831,0.27984,2.244191


Map product_id → product_name (make the report readable)

In [66]:
# ---- Map product IDs to product names for interpretation ----

# Lookup built from the products table: product_id -> product_name.
product_id_to_name = dict(
    zip(products["product_id"], products["product_name"])
)

def map_itemset(itemset):
    """Translate an itemset of product ids into a list of product names.

    Ids that have no entry in ``product_id_to_name`` fall back to their
    string form. The names are returned sorted: an itemset is a set, so its
    iteration order is not a stable display order for multi-item itemsets.
    """
    # key=str keeps the sort well-defined even if a looked-up name is not a string.
    return sorted((product_id_to_name.get(pid, str(pid)) for pid in itemset), key=str)

# Work on a copy so the id-based top_3_rules table stays available.
top_3_rules_named = top_3_rules.copy()
for rule_side in ("antecedents", "consequents"):
    top_3_rules_named[rule_side] = top_3_rules_named[rule_side].apply(map_itemset)

top_3_rules_named


Unnamed: 0,antecedents,consequents,support,confidence,lift
0,[Organic Strawberries],[Organic Raspberries],0.010557,0.117045,2.768401
1,[Organic Raspberries],[Organic Strawberries],0.010557,0.249695,2.768401
2,[Organic Raspberries],[Bag of Organic Bananas],0.011831,0.27984,2.244191


## Association Rule Analysis

Association rules were generated from frequent itemsets using the lift metric. Lift values greater than 1 indicate a positive association between products, meaning they are purchased together more often than expected by chance.

### Top 3 Rules by Lift

#### Rule 1
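- **If an order contains Organic Strawberries, it is about 2.8 times more likely than a typical order to also contain Organic Raspberries** (this still happens in only about 12% of strawberry orders)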
- Support: 0.0106
- Confidence: 0.1170
- Lift: 2.77

This rule shows a strong co-purchase relationship between fresh organic berries. It is suitable for cross-selling strategies, such as recommending raspberries when strawberries are added to the cart or offering bundled promotions.

---

#### Rule 2
- **If an order contains Organic Raspberries, it is about 2.8 times more likely than a typical order to also contain Organic Strawberries** (about 25% of raspberry orders)
- Support: 0.0106
- Confidence: 0.2497
- Lift: 2.77

This bidirectional rule confirms a strong mutual relationship between these products. It can be used in recommendation systems and “frequently bought together” features to improve basket size.

---

#### Rule 3
- **If an order contains Organic Raspberries, it is about 2.2 times more likely than a typical order to also contain a Bag of Organic Bananas** (about 28% of raspberry orders)
- Support: 0.0118
- Confidence: 0.2798
- Lift: 2.24

This association suggests that customers purchasing organic berries often include staple organic produce items. This insight can be leveraged for product placement optimization and targeted discounts.


## Task 6 – Business Interpretation


# Business Interpretation of Association Rules

The association rules discovered from the Instacart dataset provide actionable insights into customer purchasing behavior. These patterns can be leveraged by an online grocery retailer to improve recommendations, increase basket size, and optimize marketing strategies.

### 1. Recommendation Systems
Strong association rules with high lift values, such as the relationship between *Organic Strawberries* and *Organic Raspberries*, can be directly integrated into recommendation engines. When a customer adds one product to their cart, the associated product can be suggested in real time as a “frequently bought together” item. This increases the likelihood of additional purchases and improves the overall customer experience.

### 2. Cross-Selling and Bundling
Rules with meaningful confidence values indicate reliable co-purchase behavior. For example, customers buying organic berries often also purchase staple organic produce such as bananas. These insights can be used to create product bundles or cross-selling promotions, such as discounts applied when related items are purchased together.

### 3. Promotions and Targeted Marketing
Association rules allow retailers to design targeted promotions based on real customer behavior. Instead of offering generic discounts, promotions can focus on complementary products identified through rule mining. This leads to more efficient marketing spend and higher conversion rates.

### 4. Store Layout and Product Placement
In both physical and online store layouts, products with strong associations should be placed closer together or linked digitally. For example, placing organic berries near other organic fruits or highlighting them together in category pages can encourage customers to add multiple related items to their basket.

### 5. Strategic Decision Support
From a strategic perspective, Market Basket Analysis supports data-driven decisions by revealing hidden relationships in transactional data. These insights help retailers understand customer preferences, improve inventory planning, and design personalized shopping experiences.

Overall, association rule mining transforms raw transactional data into practical business intelligence that supports revenue growth and customer satisfaction.
